Skip to content

Commit afb6767

Browse files
Merge pull request #106 from unhappychoice/perf/parallelize-challenge-conversion
perf(extractor): parallelize challenge conversion and zen file processor
2 parents f8ccbbf + b5f84e0 commit afb6767

File tree

4 files changed

+139
-88
lines changed

4 files changed

+139
-88
lines changed

src/extractor/challenge_converter.rs

Lines changed: 64 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
use super::{CodeChunk, ProgressReporter};
22
use crate::game::Challenge;
3+
use rayon::prelude::*;
4+
use std::sync::{
5+
atomic::{AtomicUsize, Ordering},
6+
Arc,
7+
};
38
use uuid::Uuid;
49

510
struct NoOpProgressReporter;
@@ -44,35 +49,40 @@ impl ChallengeConverter {
4449
chunks: Vec<CodeChunk>,
4550
progress: &dyn ProgressReporter,
4651
) -> Vec<Challenge> {
47-
let mut all_challenges = Vec::new();
48-
let total_chunks = chunks.len();
49-
50-
for (chunk_index, chunk) in chunks.iter().enumerate() {
51-
// Update progress
52-
let chunk_progress = chunk_index as f64 / total_chunks as f64;
53-
progress.set_progress(chunk_progress);
54-
progress.set_file_counts(chunk_index, total_chunks);
55-
56-
// Generate challenges for Easy (~100), Normal (~200), Hard (~500), Wild (full chunks) only
57-
let difficulties = [
58-
super::super::game::stage_builder::DifficultyLevel::Easy,
59-
super::super::game::stage_builder::DifficultyLevel::Normal,
60-
super::super::game::stage_builder::DifficultyLevel::Hard,
61-
super::super::game::stage_builder::DifficultyLevel::Wild,
62-
];
63-
64-
for difficulty in &difficulties {
65-
let split_challenges = self.split_chunk_by_difficulty(chunk, difficulty);
66-
all_challenges.extend(split_challenges);
67-
}
68-
}
52+
// Parallelize per-chunk conversion with atomic progress updates
53+
let total = chunks.len();
54+
let processed = Arc::new(AtomicUsize::new(0));
55+
56+
let difficulties = [
57+
super::super::game::stage_builder::DifficultyLevel::Easy,
58+
super::super::game::stage_builder::DifficultyLevel::Normal,
59+
super::super::game::stage_builder::DifficultyLevel::Hard,
60+
super::super::game::stage_builder::DifficultyLevel::Wild,
61+
];
62+
63+
// Initialize progress bounds (0 processed yet)
64+
progress.set_file_counts(0, total);
65+
66+
let all: Vec<Challenge> = chunks
67+
.into_par_iter()
68+
.flat_map(|chunk| {
69+
let mut local = Vec::new();
70+
for difficulty in &difficulties {
71+
let split = self.split_chunk_by_difficulty(&chunk, difficulty);
72+
local.extend(split);
73+
}
6974

70-
// Final progress update
75+
// Track progress count without calling reporter from parallel context
76+
let _ = processed.fetch_add(1, Ordering::Relaxed);
77+
local
78+
})
79+
.collect();
80+
81+
// Final progress update only (avoid Sync bound on ProgressReporter)
82+
progress.set_file_counts(total, total);
7183
progress.set_progress(1.0);
72-
progress.set_file_counts(total_chunks, total_chunks);
7384

74-
// Zen challenges are now handled separately in main.rs
75-
all_challenges
85+
all
7686
}
7787

7888
pub fn convert_with_filter<F>(&self, chunks: Vec<CodeChunk>, filter: F) -> Vec<Challenge>
@@ -91,40 +101,35 @@ impl ChallengeConverter {
91101
chunks: Vec<CodeChunk>,
92102
difficulty: &super::super::game::stage_builder::DifficultyLevel,
93103
) -> Vec<Challenge> {
94-
let mut challenges = Vec::new();
95-
96-
for chunk in chunks {
97-
let split_challenges = self.split_chunk_by_difficulty(&chunk, difficulty);
98-
challenges.extend(split_challenges);
99-
}
100-
101-
challenges
104+
chunks
105+
.into_par_iter()
106+
.flat_map(|chunk| self.split_chunk_by_difficulty(&chunk, difficulty))
107+
.collect()
102108
}
103109

104110
pub fn convert_whole_files_to_challenges(
105111
&self,
106112
file_paths: Vec<std::path::PathBuf>,
107113
) -> Vec<Challenge> {
108114
use super::super::game::stage_builder::DifficultyLevel;
109-
let mut challenges = Vec::new();
110-
111-
for file_path in file_paths {
112-
if let Ok(content) = std::fs::read_to_string(&file_path) {
115+
file_paths
116+
.into_par_iter()
117+
.filter_map(|file_path| {
118+
std::fs::read_to_string(&file_path)
119+
.ok()
120+
.map(|c| (file_path, c))
121+
})
122+
.map(|(file_path, content)| {
113123
let id = Uuid::new_v4().to_string();
114124
let language = super::Language::detect_from_path(&file_path);
115125
let file_path_str = file_path.to_string_lossy().to_string();
116-
117126
let line_count = content.lines().count();
118-
let challenge = Challenge::new(id, content)
127+
Challenge::new(id, content)
119128
.with_source_info(file_path_str, 1, line_count)
120129
.with_language(language)
121-
.with_difficulty_level(DifficultyLevel::Zen);
122-
123-
challenges.push(challenge);
124-
}
125-
}
126-
127-
challenges
130+
.with_difficulty_level(DifficultyLevel::Zen)
131+
})
132+
.collect()
128133
}
129134

130135
fn split_chunk_by_difficulty(
@@ -336,24 +341,23 @@ impl ChallengeConverter {
336341
&self,
337342
file_paths: Vec<std::path::PathBuf>,
338343
) -> Vec<Challenge> {
339-
let mut zen_challenges = Vec::new();
340-
341-
for file_path in file_paths {
342-
if let Ok(content) = std::fs::read_to_string(&file_path) {
344+
file_paths
345+
.into_par_iter()
346+
.filter_map(|file_path| {
347+
std::fs::read_to_string(&file_path)
348+
.ok()
349+
.map(|c| (file_path, c))
350+
})
351+
.map(|(file_path, content)| {
343352
let id = uuid::Uuid::new_v4().to_string();
344353
let language = super::Language::detect_from_path(&file_path);
345354
let file_path_str = file_path.to_string_lossy().to_string();
346-
347355
let line_count = content.lines().count();
348-
let challenge = Challenge::new(id, content)
356+
Challenge::new(id, content)
349357
.with_source_info(file_path_str, 1, line_count)
350358
.with_language(language)
351-
.with_difficulty_level(super::super::game::stage_builder::DifficultyLevel::Zen);
352-
353-
zen_challenges.push(challenge);
354-
}
355-
}
356-
357-
zen_challenges
359+
.with_difficulty_level(super::super::game::stage_builder::DifficultyLevel::Zen)
360+
})
361+
.collect()
358362
}
359363
}

src/extractor/core/extractor.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::extractor::models::{CodeChunk, Language};
2-
use crate::extractor::parsers::get_parser_registry;
2+
use crate::extractor::parsers::{get_parser_registry, parse_with_thread_local};
33
use crate::{GitTypeError, Result};
44
use std::fs;
55
use std::path::Path;
@@ -46,10 +46,8 @@ impl CommonExtractor {
4646

4747
pub fn extract_from_file(file_path: &Path, language: Language) -> Result<Vec<CodeChunk>> {
4848
let content = fs::read_to_string(file_path)?;
49-
let registry = get_parser_registry();
50-
let mut parser = registry.create_parser(language)?;
51-
52-
let tree = parser.parse(&content, None).ok_or_else(|| {
49+
// Reuse per-thread parser instance for the language
50+
let tree = parse_with_thread_local(language, &content).ok_or_else(|| {
5351
GitTypeError::ExtractionFailed(format!("Failed to parse file: {:?}", file_path))
5452
})?;
5553

src/extractor/parser.rs

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,18 @@ impl CodeExtractor {
2929
) -> Result<Vec<CodeChunk>> {
3030
progress.set_step(crate::game::models::loading_steps::StepType::Scanning);
3131

32+
// Compile glob patterns once for faster matching
33+
let include_patterns: Vec<glob::Pattern> = _options
34+
.include_patterns
35+
.iter()
36+
.filter_map(|p| glob::Pattern::new(p).ok())
37+
.collect();
38+
let exclude_patterns: Vec<glob::Pattern> = _options
39+
.exclude_patterns
40+
.iter()
41+
.filter_map(|p| glob::Pattern::new(p).ok())
42+
.collect();
43+
3244
// First pass: count total files to process
3345
let walker_count = WalkBuilder::new(repo_path)
3446
.hidden(false) // Include hidden files
@@ -50,7 +62,11 @@ impl CodeExtractor {
5062

5163
if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
5264
if let Some(_language) = Language::from_extension(extension) {
53-
if Self::should_process_file_static(path, &_options) {
65+
if Self::should_process_file_compiled(
66+
path,
67+
&include_patterns,
68+
&exclude_patterns,
69+
) {
5470
total_files_to_process += 1;
5571
}
5672
}
@@ -79,7 +95,11 @@ impl CodeExtractor {
7995

8096
if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
8197
if let Some(language) = Language::from_extension(extension) {
82-
if Self::should_process_file_static(path, &_options) {
98+
if Self::should_process_file_compiled(
99+
path,
100+
&include_patterns,
101+
&exclude_patterns,
102+
) {
83103
files_to_process.push((path.to_path_buf(), language));
84104
processed_count += 1;
85105

@@ -142,30 +162,33 @@ impl CodeExtractor {
142162
Ok(all_chunks)
143163
}
144164

165+
#[allow(dead_code)]
145166
fn should_process_file_static(path: &Path, _options: &ExtractionOptions) -> bool {
146-
let path_str = path.to_string_lossy();
167+
// Kept for backward compatibility; build patterns on the fly (slower)
168+
let include_patterns: Vec<glob::Pattern> = _options
169+
.include_patterns
170+
.iter()
171+
.filter_map(|p| glob::Pattern::new(p).ok())
172+
.collect();
173+
let exclude_patterns: Vec<glob::Pattern> = _options
174+
.exclude_patterns
175+
.iter()
176+
.filter_map(|p| glob::Pattern::new(p).ok())
177+
.collect();
178+
Self::should_process_file_compiled(path, &include_patterns, &exclude_patterns)
179+
}
147180

148-
// Check exclude patterns first
149-
for pattern in &_options.exclude_patterns {
150-
if glob::Pattern::new(pattern)
151-
.map(|p| p.matches(&path_str))
152-
.unwrap_or(false)
153-
{
154-
return false;
155-
}
156-
}
181+
fn should_process_file_compiled(
182+
path: &Path,
183+
include_patterns: &[glob::Pattern],
184+
exclude_patterns: &[glob::Pattern],
185+
) -> bool {
186+
let path_str = path.to_string_lossy();
157187

158-
// Check include patterns
159-
for pattern in &_options.include_patterns {
160-
if glob::Pattern::new(pattern)
161-
.map(|p| p.matches(&path_str))
162-
.unwrap_or(false)
163-
{
164-
return true;
165-
}
188+
if exclude_patterns.iter().any(|p| p.matches(&path_str)) {
189+
return false;
166190
}
167-
168-
false
191+
include_patterns.iter().any(|p| p.matches(&path_str))
169192
}
170193

171194
pub fn extract_from_file(

src/extractor/parsers/mod.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use crate::extractor::models::{ChunkType, Language};
22
use crate::{GitTypeError, Result};
33
use once_cell::sync::Lazy;
4+
use std::cell::RefCell;
45
use std::collections::HashMap;
5-
use tree_sitter::{Node, Parser, Query};
6+
use tree_sitter::{Node, Parser, Query, Tree};
67

78
pub mod csharp;
89
pub mod go;
@@ -165,3 +166,28 @@ static REGISTRY: Lazy<ParserRegistry> = Lazy::new(ParserRegistry::new);
165166
pub fn get_parser_registry() -> &'static ParserRegistry {
166167
&REGISTRY
167168
}
169+
170+
thread_local! {
171+
static TL_PARSERS: RefCell<std::collections::HashMap<Language, Parser>> = RefCell::new(std::collections::HashMap::new());
172+
}
173+
174+
/// Parse source using a thread-local parser per language to avoid re-allocations.
175+
pub fn parse_with_thread_local(language: Language, content: &str) -> Option<Tree> {
176+
TL_PARSERS.with(|cell| {
177+
let mut map = cell.borrow_mut();
178+
let parser = match map.get_mut(&language) {
179+
Some(p) => p,
180+
None => {
181+
// Create and insert parser if not exists
182+
match REGISTRY.create_parser(language) {
183+
Ok(p) => {
184+
map.insert(language, p);
185+
map.get_mut(&language).unwrap()
186+
}
187+
Err(_) => return None,
188+
}
189+
}
190+
};
191+
parser.parse(content, None)
192+
})
193+
}

0 commit comments

Comments
 (0)