Skip to content

Commit 48e754e

Browse files
unhappychoice and claude committed
feat: implement parallel AST parsing with rayon
- Add rayon dependency for parallel processing - Refactor CodeExtractor to support parallel file processing - Create static methods to enable thread-safe AST parsing - Implement parallel processing in extract_chunks_with_progress method - Remove unsafe progress reporting to ensure thread safety - Add performance test for parallel parsing functionality Performance improvements: - Process multiple files in parallel using rayon::par_iter() - Each thread creates its own parser instance for thread safety - Maintains existing API compatibility - No caching implementation (as requested) 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent ddb5a67 commit 48e754e

File tree

4 files changed

+173
-42
lines changed

4 files changed

+173
-42
lines changed

Cargo.lock

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ ctrlc = "3.4.7"
3939
uuid = { version = "1.0", features = ["v4"] }
4040
rand = { version = "0.8", features = ["std_rng"] }
4141
once_cell = "1.19"
42+
rayon = "1.8"
4243

4344
[dev-dependencies]
4445
tempfile = "3.8"

src/extractor/parser.rs

Lines changed: 83 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use std::path::Path;
22
use std::fs;
33
use tree_sitter::{Parser, Query, QueryCursor, Node, Tree};
44
use ignore::WalkBuilder;
5+
use rayon::prelude::*;
56
use crate::{Result, GitTypeError};
67
use super::{CodeChunk, Language, ChunkType, ProgressReporter, NoOpProgressReporter};
78

@@ -22,31 +23,30 @@ impl Default for ExtractionOptions {
2223
}
2324
}
2425

25-
pub struct CodeExtractor {
26-
rust_parser: Parser,
27-
typescript_parser: Parser,
28-
python_parser: Parser,
29-
}
26+
pub struct CodeExtractor;
3027

3128
impl CodeExtractor {
3229
pub fn new() -> Result<Self> {
33-
let mut rust_parser = Parser::new();
34-
rust_parser.set_language(tree_sitter_rust::language())
35-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Rust language: {}", e)))?;
36-
37-
let mut typescript_parser = Parser::new();
38-
typescript_parser.set_language(tree_sitter_typescript::language_typescript())
39-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set TypeScript language: {}", e)))?;
40-
41-
let mut python_parser = Parser::new();
42-
python_parser.set_language(tree_sitter_python::language())
43-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Python language: {}", e)))?;
44-
45-
Ok(Self {
46-
rust_parser,
47-
typescript_parser,
48-
python_parser,
49-
})
30+
Ok(Self)
31+
}
32+
33+
fn create_parser_for_language(language: Language) -> Result<Parser> {
34+
let mut parser = Parser::new();
35+
match language {
36+
Language::Rust => {
37+
parser.set_language(tree_sitter_rust::language())
38+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Rust language: {}", e)))?;
39+
}
40+
Language::TypeScript => {
41+
parser.set_language(tree_sitter_typescript::language_typescript())
42+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set TypeScript language: {}", e)))?;
43+
}
44+
Language::Python => {
45+
parser.set_language(tree_sitter_python::language())
46+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Python language: {}", e)))?;
47+
}
48+
}
49+
Ok(parser)
5050
}
5151

5252
pub fn extract_chunks(&mut self, repo_path: &Path, options: ExtractionOptions) -> Result<Vec<CodeChunk>> {
@@ -59,8 +59,6 @@ impl CodeExtractor {
5959
options: ExtractionOptions,
6060
progress: &P,
6161
) -> Result<Vec<CodeChunk>> {
62-
let mut chunks = Vec::new();
63-
6462
progress.set_phase("Scanning repository".to_string());
6563

6664
// Use ignore crate to respect .gitignore files
@@ -83,7 +81,7 @@ impl CodeExtractor {
8381

8482
if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
8583
if let Some(language) = Language::from_extension(extension) {
86-
if self.should_process_file(path, &options) {
84+
if Self::should_process_file_static(path, &options) {
8785
files_to_process.push((path.to_path_buf(), language));
8886
}
8987
}
@@ -93,13 +91,18 @@ impl CodeExtractor {
9391
let total_files = files_to_process.len();
9492
progress.set_phase("Parsing AST".to_string());
9593

96-
for (i, (path, language)) in files_to_process.iter().enumerate() {
97-
progress.set_file_counts(i + 1, total_files);
98-
if i % 3 == 0 { // Update spinner every 3 files to reduce flicker
99-
progress.update_spinner();
100-
}
101-
102-
let file_chunks = self.extract_from_file(path, *language, &options)?;
94+
// Process files in parallel using rayon without progress updates during parallel processing
95+
let all_chunks: Result<Vec<Vec<CodeChunk>>> = files_to_process
96+
.par_iter()
97+
.map(|(path, language)| {
98+
// Extract chunks from file
99+
Self::extract_from_file_static(path, *language, &options)
100+
})
101+
.collect();
102+
103+
let all_chunks = all_chunks?;
104+
let mut chunks = Vec::new();
105+
for file_chunks in all_chunks {
103106
chunks.extend(file_chunks);
104107
}
105108

@@ -111,6 +114,10 @@ impl CodeExtractor {
111114
}
112115

113116
fn should_process_file(&self, path: &Path, options: &ExtractionOptions) -> bool {
117+
Self::should_process_file_static(path, options)
118+
}
119+
120+
fn should_process_file_static(path: &Path, options: &ExtractionOptions) -> bool {
114121
let path_str = path.to_string_lossy();
115122

116123
// Check exclude patterns first
@@ -135,17 +142,17 @@ impl CodeExtractor {
135142
}
136143

137144
pub fn extract_from_file(&mut self, file_path: &Path, language: Language, options: &ExtractionOptions) -> Result<Vec<CodeChunk>> {
145+
Self::extract_from_file_static(file_path, language, options)
146+
}
147+
148+
fn extract_from_file_static(file_path: &Path, language: Language, options: &ExtractionOptions) -> Result<Vec<CodeChunk>> {
138149
let content = fs::read_to_string(file_path)?;
139-
let parser = match language {
140-
Language::Rust => &mut self.rust_parser,
141-
Language::TypeScript => &mut self.typescript_parser,
142-
Language::Python => &mut self.python_parser,
143-
};
150+
let mut parser = Self::create_parser_for_language(language)?;
144151

145152
let tree = parser.parse(&content, None)
146153
.ok_or_else(|| GitTypeError::ExtractionFailed(format!("Failed to parse file: {:?}", file_path)))?;
147154

148-
self.extract_chunks_from_tree(&tree, &content, file_path, language, options)
155+
Self::extract_chunks_from_tree_static(&tree, &content, file_path, language, options)
149156
}
150157

151158
fn extract_chunks_from_tree(
@@ -155,11 +162,21 @@ impl CodeExtractor {
155162
file_path: &Path,
156163
language: Language,
157164
options: &ExtractionOptions,
165+
) -> Result<Vec<CodeChunk>> {
166+
Self::extract_chunks_from_tree_static(tree, source_code, file_path, language, options)
167+
}
168+
169+
fn extract_chunks_from_tree_static(
170+
tree: &Tree,
171+
source_code: &str,
172+
file_path: &Path,
173+
language: Language,
174+
options: &ExtractionOptions,
158175
) -> Result<Vec<CodeChunk>> {
159176
let mut chunks = Vec::new();
160177

161178
// Extract comment ranges for the entire file
162-
let file_comment_ranges = self.extract_comment_ranges(tree, source_code, language.clone());
179+
let file_comment_ranges = Self::extract_comment_ranges_static(tree, source_code, language.clone());
163180

164181
let query_str = match language {
165182
Language::Rust => "
@@ -191,7 +208,7 @@ impl CodeExtractor {
191208
let node = capture.node;
192209
let capture_name = &query.capture_names()[capture.index as usize];
193210

194-
if let Some(chunk) = self.node_to_chunk(node, source_code, file_path, language.clone(), &capture_name, options, &file_comment_ranges) {
211+
if let Some(chunk) = Self::node_to_chunk_static(node, source_code, file_path, language.clone(), &capture_name, options, &file_comment_ranges) {
195212
chunks.push(chunk);
196213
}
197214
}
@@ -209,6 +226,18 @@ impl CodeExtractor {
209226
capture_name: &str,
210227
options: &ExtractionOptions,
211228
file_comment_ranges: &[(usize, usize)],
229+
) -> Option<CodeChunk> {
230+
Self::node_to_chunk_static(node, source_code, file_path, language, capture_name, options, file_comment_ranges)
231+
}
232+
233+
fn node_to_chunk_static(
234+
node: Node,
235+
source_code: &str,
236+
file_path: &Path,
237+
language: Language,
238+
capture_name: &str,
239+
options: &ExtractionOptions,
240+
file_comment_ranges: &[(usize, usize)],
212241
) -> Option<CodeChunk> {
213242
let start_byte = node.start_byte();
214243
let end_byte = node.end_byte();
@@ -234,7 +263,7 @@ impl CodeExtractor {
234263
_ => return None,
235264
};
236265

237-
let name = self.extract_name(node, source_code).unwrap_or_else(|| "unknown".to_string());
266+
let name = Self::extract_name_static(node, source_code).unwrap_or_else(|| "unknown".to_string());
238267

239268
// Filter comment ranges that are within this chunk and make them relative to chunk content
240269
let chunk_comment_ranges: Vec<(usize, usize)> = file_comment_ranges.iter()
@@ -249,7 +278,7 @@ impl CodeExtractor {
249278
.collect();
250279

251280
// Normalize indentation based on AST node position
252-
let (normalized_content, normalized_comment_ranges) = self.normalize_indentation(
281+
let (normalized_content, normalized_comment_ranges) = Self::normalize_indentation_static(
253282
content,
254283
original_indentation,
255284
&chunk_comment_ranges
@@ -269,6 +298,10 @@ impl CodeExtractor {
269298
}
270299

271300
fn extract_name(&self, node: Node, source_code: &str) -> Option<String> {
301+
Self::extract_name_static(node, source_code)
302+
}
303+
304+
fn extract_name_static(node: Node, source_code: &str) -> Option<String> {
272305
// For variable_declarator, we need to get the name from the first child
273306
if node.kind() == "variable_declarator" {
274307
let mut cursor = node.walk();
@@ -301,6 +334,10 @@ impl CodeExtractor {
301334
}
302335

303336
fn normalize_indentation(&self, content: &str, original_indentation: usize, comment_ranges: &[(usize, usize)]) -> (String, Vec<(usize, usize)>) {
337+
Self::normalize_indentation_static(content, original_indentation, comment_ranges)
338+
}
339+
340+
fn normalize_indentation_static(content: &str, original_indentation: usize, comment_ranges: &[(usize, usize)]) -> (String, Vec<(usize, usize)>) {
304341
let lines: Vec<&str> = content.lines().collect();
305342
if lines.is_empty() {
306343
return (content.to_string(), comment_ranges.to_vec());
@@ -396,6 +433,10 @@ impl CodeExtractor {
396433
}
397434

398435
fn extract_comment_ranges(&self, tree: &Tree, source_code: &str, language: Language) -> Vec<(usize, usize)> {
436+
Self::extract_comment_ranges_static(tree, source_code, language)
437+
}
438+
439+
fn extract_comment_ranges_static(tree: &Tree, source_code: &str, language: Language) -> Vec<(usize, usize)> {
399440
let mut comment_ranges = Vec::new();
400441

401442
let comment_query = match language {

tests/extractor_unit_tests.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use gittype::extractor::{CodeExtractor, ExtractionOptions, Language, ChunkType,
22
use gittype::GitTypeError;
33
use std::fs;
44
use std::path::{Path, PathBuf};
5+
use std::time::Instant;
56
use tempfile::TempDir;
67

78
// Basic extractor tests
@@ -213,3 +214,70 @@ fn test_repository_not_found() {
213214

214215
assert!(matches!(result, Err(GitTypeError::RepositoryNotFound(_))));
215216
}
217+
218+
#[test]
219+
fn test_parallel_ast_parsing_performance() {
220+
let temp_dir = TempDir::new().unwrap();
221+
222+
// Create multiple test files with different languages
223+
for i in 0..10 {
224+
let rust_file = temp_dir.path().join(format!("test_{}.rs", i));
225+
fs::write(&rust_file, format!(r#"
226+
fn function_{}() {{
227+
println!("Function {{}}", {});
228+
}}
229+
230+
struct Struct{} {{
231+
field: i32,
232+
}}
233+
234+
impl Struct{} {{
235+
fn method_{}(&self) -> i32 {{
236+
self.field + {}
237+
}}
238+
}}
239+
"#, i, i, i, i, i, i)).unwrap();
240+
241+
let ts_file = temp_dir.path().join(format!("test_{}.ts", i));
242+
fs::write(&ts_file, format!(r#"
243+
function tsFunction{}(x: number): number {{
244+
return x * {};
245+
}}
246+
247+
class TsClass{} {{
248+
private value: number = {};
249+
250+
public getValue(): number {{
251+
return this.value;
252+
}}
253+
}}
254+
"#, i, i, i, i)).unwrap();
255+
}
256+
257+
let mut extractor = CodeExtractor::new().unwrap();
258+
let options = ExtractionOptions::default();
259+
260+
let start = Instant::now();
261+
let chunks = extractor.extract_chunks(temp_dir.path(), options).unwrap();
262+
let duration = start.elapsed();
263+
264+
// Should extract functions, structs, impls, and classes from all files
265+
assert!(chunks.len() >= 40, "Expected at least 40 chunks, got {}", chunks.len()); // 10 files * (1 fn + 1 struct + 1 impl + 1 ts function + 1 ts class) = 50 minimum
266+
267+
println!("Parallel extraction of {} files took {:?}", 20, duration);
268+
println!("Found {} total code chunks", chunks.len());
269+
270+
// Verify we have different types of chunks
271+
let function_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Function)).count();
272+
let struct_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Struct)).count();
273+
let class_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Class)).count();
274+
275+
println!("Types found: {} functions, {} structs, {} classes", function_count, struct_count, class_count);
276+
277+
assert!(function_count >= 20, "Should find at least 20 functions");
278+
assert!(struct_count >= 10, "Should find at least 10 structs");
279+
assert!(class_count >= 10, "Should find at least 10 classes");
280+
281+
// Performance test - should complete reasonably quickly
282+
assert!(duration.as_millis() < 5000, "Parallel parsing should complete within 5 seconds");
283+
}

0 commit comments

Comments
 (0)