Skip to content

Commit d66765d

Browse files
Merge pull request #38 from unhappychoice/feat/parallel-ast-parsing
feat: implement parallel AST parsing for improved performance
2 parents ddb5a67 + 614b0ac commit d66765d

File tree

5 files changed

+189
-44
lines changed

5 files changed

+189
-44
lines changed

Cargo.lock

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ ctrlc = "3.4.7"
3939
uuid = { version = "1.0", features = ["v4"] }
4040
rand = { version = "0.8", features = ["std_rng"] }
4141
once_cell = "1.19"
42+
rayon = "1.8"
4243

4344
[dev-dependencies]
4445
tempfile = "3.8"

src/extractor/parser.rs

Lines changed: 98 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ use std::path::Path;
22
use std::fs;
33
use tree_sitter::{Parser, Query, QueryCursor, Node, Tree};
44
use ignore::WalkBuilder;
5+
use rayon::prelude::*;
56
use crate::{Result, GitTypeError};
67
use super::{CodeChunk, Language, ChunkType, ProgressReporter, NoOpProgressReporter};
78

@@ -22,31 +23,30 @@ impl Default for ExtractionOptions {
2223
}
2324
}
2425

25-
pub struct CodeExtractor {
26-
rust_parser: Parser,
27-
typescript_parser: Parser,
28-
python_parser: Parser,
29-
}
26+
pub struct CodeExtractor;
3027

3128
impl CodeExtractor {
3229
pub fn new() -> Result<Self> {
33-
let mut rust_parser = Parser::new();
34-
rust_parser.set_language(tree_sitter_rust::language())
35-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Rust language: {}", e)))?;
36-
37-
let mut typescript_parser = Parser::new();
38-
typescript_parser.set_language(tree_sitter_typescript::language_typescript())
39-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set TypeScript language: {}", e)))?;
40-
41-
let mut python_parser = Parser::new();
42-
python_parser.set_language(tree_sitter_python::language())
43-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Python language: {}", e)))?;
44-
45-
Ok(Self {
46-
rust_parser,
47-
typescript_parser,
48-
python_parser,
49-
})
30+
Ok(Self)
31+
}
32+
33+
fn create_parser_for_language(language: Language) -> Result<Parser> {
34+
let mut parser = Parser::new();
35+
match language {
36+
Language::Rust => {
37+
parser.set_language(tree_sitter_rust::language())
38+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Rust language: {}", e)))?;
39+
}
40+
Language::TypeScript => {
41+
parser.set_language(tree_sitter_typescript::language_typescript())
42+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set TypeScript language: {}", e)))?;
43+
}
44+
Language::Python => {
45+
parser.set_language(tree_sitter_python::language())
46+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Python language: {}", e)))?;
47+
}
48+
}
49+
Ok(parser)
5050
}
5151

5252
pub fn extract_chunks(&mut self, repo_path: &Path, options: ExtractionOptions) -> Result<Vec<CodeChunk>> {
@@ -59,8 +59,6 @@ impl CodeExtractor {
5959
options: ExtractionOptions,
6060
progress: &P,
6161
) -> Result<Vec<CodeChunk>> {
62-
let mut chunks = Vec::new();
63-
6462
progress.set_phase("Scanning repository".to_string());
6563

6664
// Use ignore crate to respect .gitignore files
@@ -83,7 +81,7 @@ impl CodeExtractor {
8381

8482
if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
8583
if let Some(language) = Language::from_extension(extension) {
86-
if self.should_process_file(path, &options) {
84+
if Self::should_process_file_static(path, &options) {
8785
files_to_process.push((path.to_path_buf(), language));
8886
}
8987
}
@@ -93,24 +91,47 @@ impl CodeExtractor {
9391
let total_files = files_to_process.len();
9492
progress.set_phase("Parsing AST".to_string());
9593

96-
for (i, (path, language)) in files_to_process.iter().enumerate() {
97-
progress.set_file_counts(i + 1, total_files);
98-
if i % 3 == 0 { // Update spinner every 3 files to reduce flicker
99-
progress.update_spinner();
100-
}
94+
// Process files in parallel with better progress tracking
95+
// Split files into smaller chunks for better progress visibility
96+
let chunk_size = (total_files / 20).max(1).min(10); // Process in smaller chunks of 1-10 files
97+
let mut all_chunks = Vec::new();
98+
let mut processed_files = 0;
99+
100+
for chunk in files_to_process.chunks(chunk_size) {
101+
// Process this chunk in parallel
102+
let chunk_results: Result<Vec<Vec<CodeChunk>>> = chunk
103+
.par_iter()
104+
.map(|(path, language)| {
105+
Self::extract_from_file_static(path, *language, &options)
106+
})
107+
.collect();
108+
109+
// Update progress after each chunk
110+
processed_files += chunk.len();
111+
progress.set_file_counts(processed_files, total_files);
101112

102-
let file_chunks = self.extract_from_file(path, *language, &options)?;
103-
chunks.extend(file_chunks);
113+
// Update spinner for each chunk to show progress
114+
progress.update_spinner();
115+
116+
// Collect results
117+
let chunk_results = chunk_results?;
118+
for file_chunks in chunk_results {
119+
all_chunks.extend(file_chunks);
120+
}
104121
}
105122

106123
progress.set_file_counts(total_files, total_files);
107124
progress.set_current_file(None);
108125
progress.set_phase("Finalizing".to_string());
109126

110-
Ok(chunks)
127+
Ok(all_chunks)
111128
}
112129

113130
/// Instance-method form of `should_process_file_static`.
///
/// Does not read `self`: `path` and `options` are forwarded unchanged and the
/// helper's boolean result is returned as-is.
fn should_process_file(&self, path: &Path, options: &ExtractionOptions) -> bool {
131+
Self::should_process_file_static(path, options)
132+
}
133+
134+
fn should_process_file_static(path: &Path, options: &ExtractionOptions) -> bool {
114135
let path_str = path.to_string_lossy();
115136

116137
// Check exclude patterns first
@@ -135,17 +156,17 @@ impl CodeExtractor {
135156
}
136157

137158
/// Extracts the code chunks of a single file, treating it as `language` source.
///
/// Public entry point that delegates to `extract_from_file_static`; `self` is not
/// read or modified here. The helper's result, including any error, is returned
/// unchanged.
pub fn extract_from_file(&mut self, file_path: &Path, language: Language, options: &ExtractionOptions) -> Result<Vec<CodeChunk>> {
159+
Self::extract_from_file_static(file_path, language, options)
160+
}
161+
162+
fn extract_from_file_static(file_path: &Path, language: Language, options: &ExtractionOptions) -> Result<Vec<CodeChunk>> {
138163
let content = fs::read_to_string(file_path)?;
139-
let parser = match language {
140-
Language::Rust => &mut self.rust_parser,
141-
Language::TypeScript => &mut self.typescript_parser,
142-
Language::Python => &mut self.python_parser,
143-
};
164+
let mut parser = Self::create_parser_for_language(language)?;
144165

145166
let tree = parser.parse(&content, None)
146167
.ok_or_else(|| GitTypeError::ExtractionFailed(format!("Failed to parse file: {:?}", file_path)))?;
147168

148-
self.extract_chunks_from_tree(&tree, &content, file_path, language, options)
169+
Self::extract_chunks_from_tree_static(&tree, &content, file_path, language, options)
149170
}
150171

151172
fn extract_chunks_from_tree(
@@ -155,11 +176,21 @@ impl CodeExtractor {
155176
file_path: &Path,
156177
language: Language,
157178
options: &ExtractionOptions,
179+
) -> Result<Vec<CodeChunk>> {
180+
Self::extract_chunks_from_tree_static(tree, source_code, file_path, language, options)
181+
}
182+
183+
fn extract_chunks_from_tree_static(
184+
tree: &Tree,
185+
source_code: &str,
186+
file_path: &Path,
187+
language: Language,
188+
options: &ExtractionOptions,
158189
) -> Result<Vec<CodeChunk>> {
159190
let mut chunks = Vec::new();
160191

161192
// Extract comment ranges for the entire file
162-
let file_comment_ranges = self.extract_comment_ranges(tree, source_code, language.clone());
193+
let file_comment_ranges = Self::extract_comment_ranges_static(tree, source_code, language.clone());
163194

164195
let query_str = match language {
165196
Language::Rust => "
@@ -191,7 +222,7 @@ impl CodeExtractor {
191222
let node = capture.node;
192223
let capture_name = &query.capture_names()[capture.index as usize];
193224

194-
if let Some(chunk) = self.node_to_chunk(node, source_code, file_path, language.clone(), &capture_name, options, &file_comment_ranges) {
225+
if let Some(chunk) = Self::node_to_chunk_static(node, source_code, file_path, language.clone(), &capture_name, options, &file_comment_ranges) {
195226
chunks.push(chunk);
196227
}
197228
}
@@ -209,6 +240,18 @@ impl CodeExtractor {
209240
capture_name: &str,
210241
options: &ExtractionOptions,
211242
file_comment_ranges: &[(usize, usize)],
243+
) -> Option<CodeChunk> {
244+
Self::node_to_chunk_static(node, source_code, file_path, language, capture_name, options, file_comment_ranges)
245+
}
246+
247+
fn node_to_chunk_static(
248+
node: Node,
249+
source_code: &str,
250+
file_path: &Path,
251+
language: Language,
252+
capture_name: &str,
253+
options: &ExtractionOptions,
254+
file_comment_ranges: &[(usize, usize)],
212255
) -> Option<CodeChunk> {
213256
let start_byte = node.start_byte();
214257
let end_byte = node.end_byte();
@@ -234,7 +277,7 @@ impl CodeExtractor {
234277
_ => return None,
235278
};
236279

237-
let name = self.extract_name(node, source_code).unwrap_or_else(|| "unknown".to_string());
280+
let name = Self::extract_name_static(node, source_code).unwrap_or_else(|| "unknown".to_string());
238281

239282
// Filter comment ranges that are within this chunk and make them relative to chunk content
240283
let chunk_comment_ranges: Vec<(usize, usize)> = file_comment_ranges.iter()
@@ -249,7 +292,7 @@ impl CodeExtractor {
249292
.collect();
250293

251294
// Normalize indentation based on AST node position
252-
let (normalized_content, normalized_comment_ranges) = self.normalize_indentation(
295+
let (normalized_content, normalized_comment_ranges) = Self::normalize_indentation_static(
253296
content,
254297
original_indentation,
255298
&chunk_comment_ranges
@@ -269,6 +312,10 @@ impl CodeExtractor {
269312
}
270313

271314
/// Instance-method form of `extract_name_static`.
///
/// Does not read `self`: `node` and `source_code` are forwarded unchanged and the
/// helper's `Option<String>` is returned as-is (`None` when it finds no name).
fn extract_name(&self, node: Node, source_code: &str) -> Option<String> {
315+
Self::extract_name_static(node, source_code)
316+
}
317+
318+
fn extract_name_static(node: Node, source_code: &str) -> Option<String> {
272319
// For variable_declarator, we need to get the name from the first child
273320
if node.kind() == "variable_declarator" {
274321
let mut cursor = node.walk();
@@ -301,6 +348,10 @@ impl CodeExtractor {
301348
}
302349

303350
/// Instance-method form of `normalize_indentation_static`.
///
/// Does not read `self`: all three arguments are forwarded unchanged and the
/// helper's `(String, Vec<(usize, usize)>)` pair is returned as-is.
fn normalize_indentation(&self, content: &str, original_indentation: usize, comment_ranges: &[(usize, usize)]) -> (String, Vec<(usize, usize)>) {
351+
Self::normalize_indentation_static(content, original_indentation, comment_ranges)
352+
}
353+
354+
fn normalize_indentation_static(content: &str, original_indentation: usize, comment_ranges: &[(usize, usize)]) -> (String, Vec<(usize, usize)>) {
304355
let lines: Vec<&str> = content.lines().collect();
305356
if lines.is_empty() {
306357
return (content.to_string(), comment_ranges.to_vec());
@@ -396,6 +447,10 @@ impl CodeExtractor {
396447
}
397448

398449
/// Instance-method form of `extract_comment_ranges_static`.
///
/// Does not read `self`: `tree`, `source_code` and `language` are forwarded
/// unchanged and the helper's list of `(usize, usize)` ranges is returned as-is.
fn extract_comment_ranges(&self, tree: &Tree, source_code: &str, language: Language) -> Vec<(usize, usize)> {
450+
Self::extract_comment_ranges_static(tree, source_code, language)
451+
}
452+
453+
fn extract_comment_ranges_static(tree: &Tree, source_code: &str, language: Language) -> Vec<(usize, usize)> {
399454
let mut comment_ranges = Vec::new();
400455

401456
let comment_query = match language {

src/game/screens/typing_screen.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -513,4 +513,4 @@ impl TypingScreen {
513513
}
514514
}
515515
}
516-
}
516+
}

tests/extractor_unit_tests.rs

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ use gittype::extractor::{CodeExtractor, ExtractionOptions, Language, ChunkType,
22
use gittype::GitTypeError;
33
use std::fs;
44
use std::path::{Path, PathBuf};
5+
use std::time::Instant;
56
use tempfile::TempDir;
67

78
// Basic extractor tests
@@ -213,3 +214,70 @@ fn test_repository_not_found() {
213214

214215
assert!(matches!(result, Err(GitTypeError::RepositoryNotFound(_))));
215216
}
217+
218+
/// End-to-end check of `CodeExtractor::extract_chunks` over a temporary repository
/// containing several Rust and TypeScript files.
///
/// Verifies that a minimum number of function, struct and class chunks is found and
/// that the whole extraction finishes within a coarse wall-clock bound.
#[test]
fn test_parallel_ast_parsing_performance() {
    // Number of Rust/TypeScript file pairs written to the temp repository.
    // Every expected count below is derived from this so they cannot drift apart.
    const FILE_PAIRS: usize = 10;
    const TOTAL_FILES: usize = 2 * FILE_PAIRS;

    // Per-type lower bounds: each pair contributes at least one Rust fn and one
    // TypeScript function, one Rust struct, and one TypeScript class.
    const MIN_FUNCTIONS: usize = 2 * FILE_PAIRS;
    const MIN_STRUCTS: usize = FILE_PAIRS;
    const MIN_CLASSES: usize = FILE_PAIRS;
    // Overall lower bound is the sum of the per-type bounds; any further chunks
    // (e.g. impl blocks or methods) only push the real total higher.
    const MIN_CHUNKS: usize = MIN_FUNCTIONS + MIN_STRUCTS + MIN_CLASSES;

    let temp_dir = TempDir::new().unwrap();

    // Create multiple test files with different languages
    for i in 0..FILE_PAIRS {
        let rust_file = temp_dir.path().join(format!("test_{}.rs", i));
        fs::write(&rust_file, format!(r#"
fn function_{}() {{
println!("Function {{}}", {});
}}

struct Struct{} {{
field: i32,
}}

impl Struct{} {{
fn method_{}(&self) -> i32 {{
self.field + {}
}}
}}
"#, i, i, i, i, i, i)).unwrap();

        let ts_file = temp_dir.path().join(format!("test_{}.ts", i));
        fs::write(&ts_file, format!(r#"
function tsFunction{}(x: number): number {{
return x * {};
}}

class TsClass{} {{
private value: number = {};

public getValue(): number {{
return this.value;
}}
}}
"#, i, i, i, i)).unwrap();
    }

    let mut extractor = CodeExtractor::new().unwrap();
    let options = ExtractionOptions::default();

    let start = Instant::now();
    let chunks = extractor.extract_chunks(temp_dir.path(), options).unwrap();
    let duration = start.elapsed();

    // Should extract at least the functions, structs and classes from all files
    assert!(
        chunks.len() >= MIN_CHUNKS,
        "Expected at least {} chunks, got {}",
        MIN_CHUNKS,
        chunks.len()
    );

    println!("Parallel extraction of {} files took {:?}", TOTAL_FILES, duration);
    println!("Found {} total code chunks", chunks.len());

    // Verify we have different types of chunks
    let function_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Function)).count();
    let struct_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Struct)).count();
    let class_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Class)).count();

    println!("Types found: {} functions, {} structs, {} classes", function_count, struct_count, class_count);

    assert!(function_count >= MIN_FUNCTIONS, "Should find at least {} functions", MIN_FUNCTIONS);
    assert!(struct_count >= MIN_STRUCTS, "Should find at least {} structs", MIN_STRUCTS);
    assert!(class_count >= MIN_CLASSES, "Should find at least {} classes", MIN_CLASSES);

    // Performance test - a deliberately generous wall-clock guard against gross
    // regressions, not a benchmark.
    assert!(duration.as_millis() < 5000, "Parallel parsing should complete within 5 seconds");
}

0 commit comments

Comments
 (0)