Skip to content

Commit 48e754e

Browse files
unhappychoice and claude committed
feat: implement parallel AST parsing with rayon
- Add rayon dependency for parallel processing - Refactor CodeExtractor to support parallel file processing - Create static methods to enable thread-safe AST parsing - Implement parallel processing in extract_chunks_with_progress method - Remove unsafe progress reporting to ensure thread safety - Add performance test for parallel parsing functionality Performance improvements: - Process multiple files in parallel using rayon::par_iter() - Each thread creates its own parser instance for thread safety - Maintains existing API compatibility - No caching implementation (as requested) 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent ddb5a67 commit 48e754e

File tree

4 files changed

+173
-42
lines changed

4 files changed

+173
-42
lines changed

Cargo.lock

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ ctrlc = "3.4.7"
3939
uuid = { version = "1.0", features = ["v4"] }
4040
rand = { version = "0.8", features = ["std_rng"] }
4141
once_cell = "1.19"
42+
rayon = "1.8"
4243

4344
[dev-dependencies]
4445
tempfile = "3.8"

src/extractor/parser.rs

Lines changed: 83 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use std::path::Path;
22
use std::fs;
33
use tree_sitter::{Parser, Query, QueryCursor, Node, Tree};
44
use ignore::WalkBuilder;
5+
use rayon::prelude::*;
56
use crate::{Result, GitTypeError};
67
use super::{CodeChunk, Language, ChunkType, ProgressReporter, NoOpProgressReporter};
78

@@ -22,31 +23,30 @@ impl Default for ExtractionOptions {
2223
}
2324
}
2425

25-
pub struct CodeExtractor {
26-
rust_parser: Parser,
27-
typescript_parser: Parser,
28-
python_parser: Parser,
29-
}
26+
pub struct CodeExtractor;
3027

3128
impl CodeExtractor {
3229
pub fn new() -> Result<Self> {
33-
let mut rust_parser = Parser::new();
34-
rust_parser.set_language(tree_sitter_rust::language())
35-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Rust language: {}", e)))?;
36-
37-
let mut typescript_parser = Parser::new();
38-
typescript_parser.set_language(tree_sitter_typescript::language_typescript())
39-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set TypeScript language: {}", e)))?;
40-
41-
let mut python_parser = Parser::new();
42-
python_parser.set_language(tree_sitter_python::language())
43-
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Python language: {}", e)))?;
44-
45-
Ok(Self {
46-
rust_parser,
47-
typescript_parser,
48-
python_parser,
49-
})
30+
Ok(Self)
31+
}
32+
33+
fn create_parser_for_language(language: Language) -> Result<Parser> {
34+
let mut parser = Parser::new();
35+
match language {
36+
Language::Rust => {
37+
parser.set_language(tree_sitter_rust::language())
38+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Rust language: {}", e)))?;
39+
}
40+
Language::TypeScript => {
41+
parser.set_language(tree_sitter_typescript::language_typescript())
42+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set TypeScript language: {}", e)))?;
43+
}
44+
Language::Python => {
45+
parser.set_language(tree_sitter_python::language())
46+
.map_err(|e| GitTypeError::ExtractionFailed(format!("Failed to set Python language: {}", e)))?;
47+
}
48+
}
49+
Ok(parser)
5050
}
5151

5252
pub fn extract_chunks(&mut self, repo_path: &Path, options: ExtractionOptions) -> Result<Vec<CodeChunk>> {
@@ -59,8 +59,6 @@ impl CodeExtractor {
5959
options: ExtractionOptions,
6060
progress: &P,
6161
) -> Result<Vec<CodeChunk>> {
62-
let mut chunks = Vec::new();
63-
6462
progress.set_phase("Scanning repository".to_string());
6563

6664
// Use ignore crate to respect .gitignore files
@@ -83,7 +81,7 @@ impl CodeExtractor {
8381

8482
if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
8583
if let Some(language) = Language::from_extension(extension) {
86-
if self.should_process_file(path, &options) {
84+
if Self::should_process_file_static(path, &options) {
8785
files_to_process.push((path.to_path_buf(), language));
8886
}
8987
}
@@ -93,13 +91,18 @@ impl CodeExtractor {
9391
let total_files = files_to_process.len();
9492
progress.set_phase("Parsing AST".to_string());
9593

96-
for (i, (path, language)) in files_to_process.iter().enumerate() {
97-
progress.set_file_counts(i + 1, total_files);
98-
if i % 3 == 0 { // Update spinner every 3 files to reduce flicker
99-
progress.update_spinner();
100-
}
101-
102-
let file_chunks = self.extract_from_file(path, *language, &options)?;
94+
// Process files in parallel using rayon without progress updates during parallel processing
95+
let all_chunks: Result<Vec<Vec<CodeChunk>>> = files_to_process
96+
.par_iter()
97+
.map(|(path, language)| {
98+
// Extract chunks from file
99+
Self::extract_from_file_static(path, *language, &options)
100+
})
101+
.collect();
102+
103+
let all_chunks = all_chunks?;
104+
let mut chunks = Vec::new();
105+
for file_chunks in all_chunks {
103106
chunks.extend(file_chunks);
104107
}
105108

@@ -111,6 +114,10 @@ impl CodeExtractor {
111114
}
112115

113116
fn should_process_file(&self, path: &Path, options: &ExtractionOptions) -> bool {
117+
Self::should_process_file_static(path, options)
118+
}
119+
120+
fn should_process_file_static(path: &Path, options: &ExtractionOptions) -> bool {
114121
let path_str = path.to_string_lossy();
115122

116123
// Check exclude patterns first
@@ -135,17 +142,17 @@ impl CodeExtractor {
135142
}
136143

137144
pub fn extract_from_file(&mut self, file_path: &Path, language: Language, options: &ExtractionOptions) -> Result<Vec<CodeChunk>> {
145+
Self::extract_from_file_static(file_path, language, options)
146+
}
147+
148+
fn extract_from_file_static(file_path: &Path, language: Language, options: &ExtractionOptions) -> Result<Vec<CodeChunk>> {
138149
let content = fs::read_to_string(file_path)?;
139-
let parser = match language {
140-
Language::Rust => &mut self.rust_parser,
141-
Language::TypeScript => &mut self.typescript_parser,
142-
Language::Python => &mut self.python_parser,
143-
};
150+
let mut parser = Self::create_parser_for_language(language)?;
144151

145152
let tree = parser.parse(&content, None)
146153
.ok_or_else(|| GitTypeError::ExtractionFailed(format!("Failed to parse file: {:?}", file_path)))?;
147154

148-
self.extract_chunks_from_tree(&tree, &content, file_path, language, options)
155+
Self::extract_chunks_from_tree_static(&tree, &content, file_path, language, options)
149156
}
150157

151158
fn extract_chunks_from_tree(
@@ -155,11 +162,21 @@ impl CodeExtractor {
155162
file_path: &Path,
156163
language: Language,
157164
options: &ExtractionOptions,
165+
) -> Result<Vec<CodeChunk>> {
166+
Self::extract_chunks_from_tree_static(tree, source_code, file_path, language, options)
167+
}
168+
169+
fn extract_chunks_from_tree_static(
170+
tree: &Tree,
171+
source_code: &str,
172+
file_path: &Path,
173+
language: Language,
174+
options: &ExtractionOptions,
158175
) -> Result<Vec<CodeChunk>> {
159176
let mut chunks = Vec::new();
160177

161178
// Extract comment ranges for the entire file
162-
let file_comment_ranges = self.extract_comment_ranges(tree, source_code, language.clone());
179+
let file_comment_ranges = Self::extract_comment_ranges_static(tree, source_code, language.clone());
163180

164181
let query_str = match language {
165182
Language::Rust => "
@@ -191,7 +208,7 @@ impl CodeExtractor {
191208
let node = capture.node;
192209
let capture_name = &query.capture_names()[capture.index as usize];
193210

194-
if let Some(chunk) = self.node_to_chunk(node, source_code, file_path, language.clone(), &capture_name, options, &file_comment_ranges) {
211+
if let Some(chunk) = Self::node_to_chunk_static(node, source_code, file_path, language.clone(), &capture_name, options, &file_comment_ranges) {
195212
chunks.push(chunk);
196213
}
197214
}
@@ -209,6 +226,18 @@ impl CodeExtractor {
209226
capture_name: &str,
210227
options: &ExtractionOptions,
211228
file_comment_ranges: &[(usize, usize)],
229+
) -> Option<CodeChunk> {
230+
Self::node_to_chunk_static(node, source_code, file_path, language, capture_name, options, file_comment_ranges)
231+
}
232+
233+
fn node_to_chunk_static(
234+
node: Node,
235+
source_code: &str,
236+
file_path: &Path,
237+
language: Language,
238+
capture_name: &str,
239+
options: &ExtractionOptions,
240+
file_comment_ranges: &[(usize, usize)],
212241
) -> Option<CodeChunk> {
213242
let start_byte = node.start_byte();
214243
let end_byte = node.end_byte();
@@ -234,7 +263,7 @@ impl CodeExtractor {
234263
_ => return None,
235264
};
236265

237-
let name = self.extract_name(node, source_code).unwrap_or_else(|| "unknown".to_string());
266+
let name = Self::extract_name_static(node, source_code).unwrap_or_else(|| "unknown".to_string());
238267

239268
// Filter comment ranges that are within this chunk and make them relative to chunk content
240269
let chunk_comment_ranges: Vec<(usize, usize)> = file_comment_ranges.iter()
@@ -249,7 +278,7 @@ impl CodeExtractor {
249278
.collect();
250279

251280
// Normalize indentation based on AST node position
252-
let (normalized_content, normalized_comment_ranges) = self.normalize_indentation(
281+
let (normalized_content, normalized_comment_ranges) = Self::normalize_indentation_static(
253282
content,
254283
original_indentation,
255284
&chunk_comment_ranges
@@ -269,6 +298,10 @@ impl CodeExtractor {
269298
}
270299

271300
fn extract_name(&self, node: Node, source_code: &str) -> Option<String> {
301+
Self::extract_name_static(node, source_code)
302+
}
303+
304+
fn extract_name_static(node: Node, source_code: &str) -> Option<String> {
272305
// For variable_declarator, we need to get the name from the first child
273306
if node.kind() == "variable_declarator" {
274307
let mut cursor = node.walk();
@@ -301,6 +334,10 @@ impl CodeExtractor {
301334
}
302335

303336
fn normalize_indentation(&self, content: &str, original_indentation: usize, comment_ranges: &[(usize, usize)]) -> (String, Vec<(usize, usize)>) {
337+
Self::normalize_indentation_static(content, original_indentation, comment_ranges)
338+
}
339+
340+
fn normalize_indentation_static(content: &str, original_indentation: usize, comment_ranges: &[(usize, usize)]) -> (String, Vec<(usize, usize)>) {
304341
let lines: Vec<&str> = content.lines().collect();
305342
if lines.is_empty() {
306343
return (content.to_string(), comment_ranges.to_vec());
@@ -396,6 +433,10 @@ impl CodeExtractor {
396433
}
397434

398435
fn extract_comment_ranges(&self, tree: &Tree, source_code: &str, language: Language) -> Vec<(usize, usize)> {
436+
Self::extract_comment_ranges_static(tree, source_code, language)
437+
}
438+
439+
fn extract_comment_ranges_static(tree: &Tree, source_code: &str, language: Language) -> Vec<(usize, usize)> {
399440
let mut comment_ranges = Vec::new();
400441

401442
let comment_query = match language {

tests/extractor_unit_tests.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use gittype::extractor::{CodeExtractor, ExtractionOptions, Language, ChunkType,
22
use gittype::GitTypeError;
33
use std::fs;
44
use std::path::{Path, PathBuf};
5+
use std::time::Instant;
56
use tempfile::TempDir;
67

78
// Basic extractor tests
@@ -213,3 +214,70 @@ fn test_repository_not_found() {
213214

214215
assert!(matches!(result, Err(GitTypeError::RepositoryNotFound(_))));
215216
}
217+
218+
#[test]
219+
fn test_parallel_ast_parsing_performance() {
220+
let temp_dir = TempDir::new().unwrap();
221+
222+
// Create multiple test files with different languages
223+
for i in 0..10 {
224+
let rust_file = temp_dir.path().join(format!("test_{}.rs", i));
225+
fs::write(&rust_file, format!(r#"
226+
fn function_{}() {{
227+
println!("Function {{}}", {});
228+
}}
229+
230+
struct Struct{} {{
231+
field: i32,
232+
}}
233+
234+
impl Struct{} {{
235+
fn method_{}(&self) -> i32 {{
236+
self.field + {}
237+
}}
238+
}}
239+
"#, i, i, i, i, i, i)).unwrap();
240+
241+
let ts_file = temp_dir.path().join(format!("test_{}.ts", i));
242+
fs::write(&ts_file, format!(r#"
243+
function tsFunction{}(x: number): number {{
244+
return x * {};
245+
}}
246+
247+
class TsClass{} {{
248+
private value: number = {};
249+
250+
public getValue(): number {{
251+
return this.value;
252+
}}
253+
}}
254+
"#, i, i, i, i)).unwrap();
255+
}
256+
257+
let mut extractor = CodeExtractor::new().unwrap();
258+
let options = ExtractionOptions::default();
259+
260+
let start = Instant::now();
261+
let chunks = extractor.extract_chunks(temp_dir.path(), options).unwrap();
262+
let duration = start.elapsed();
263+
264+
// Should extract functions, structs, impls, and classes from all files
265+
assert!(chunks.len() >= 40, "Expected at least 40 chunks, got {}", chunks.len()); // 10 files * (1 fn + 1 struct + 1 impl + 1 ts function + 1 ts class) = 50 minimum
266+
267+
println!("Parallel extraction of {} files took {:?}", 20, duration);
268+
println!("Found {} total code chunks", chunks.len());
269+
270+
// Verify we have different types of chunks
271+
let function_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Function)).count();
272+
let struct_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Struct)).count();
273+
let class_count = chunks.iter().filter(|c| matches!(c.chunk_type, ChunkType::Class)).count();
274+
275+
println!("Types found: {} functions, {} structs, {} classes", function_count, struct_count, class_count);
276+
277+
assert!(function_count >= 20, "Should find at least 20 functions");
278+
assert!(struct_count >= 10, "Should find at least 10 structs");
279+
assert!(class_count >= 10, "Should find at least 10 classes");
280+
281+
// Performance test - should complete reasonably quickly
282+
assert!(duration.as_millis() < 5000, "Parallel parsing should complete within 5 seconds");
283+
}

0 commit comments

Comments
 (0)