@@ -2,6 +2,7 @@ use std::path::Path;
22use std:: fs;
33use tree_sitter:: { Parser , Query , QueryCursor , Node , Tree } ;
44use ignore:: WalkBuilder ;
5+ use rayon:: prelude:: * ;
56use crate :: { Result , GitTypeError } ;
67use super :: { CodeChunk , Language , ChunkType , ProgressReporter , NoOpProgressReporter } ;
78
@@ -22,31 +23,30 @@ impl Default for ExtractionOptions {
2223 }
2324}
2425
/// Stateless entry point for extracting code chunks from source files.
///
/// The extractor stores no parsers: a parser is built on demand for each
/// file by `create_parser_for_language`. Because the type has no fields it
/// is trivially copyable and can be printed for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct CodeExtractor;
3027
3128impl CodeExtractor {
3229 pub fn new ( ) -> Result < Self > {
33- let mut rust_parser = Parser :: new ( ) ;
34- rust_parser. set_language ( tree_sitter_rust:: language ( ) )
35- . map_err ( |e| GitTypeError :: ExtractionFailed ( format ! ( "Failed to set Rust language: {}" , e) ) ) ?;
36-
37- let mut typescript_parser = Parser :: new ( ) ;
38- typescript_parser. set_language ( tree_sitter_typescript:: language_typescript ( ) )
39- . map_err ( |e| GitTypeError :: ExtractionFailed ( format ! ( "Failed to set TypeScript language: {}" , e) ) ) ?;
40-
41- let mut python_parser = Parser :: new ( ) ;
42- python_parser. set_language ( tree_sitter_python:: language ( ) )
43- . map_err ( |e| GitTypeError :: ExtractionFailed ( format ! ( "Failed to set Python language: {}" , e) ) ) ?;
44-
45- Ok ( Self {
46- rust_parser,
47- typescript_parser,
48- python_parser,
49- } )
30+ Ok ( Self )
31+ }
32+
33+ fn create_parser_for_language ( language : Language ) -> Result < Parser > {
34+ let mut parser = Parser :: new ( ) ;
35+ match language {
36+ Language :: Rust => {
37+ parser. set_language ( tree_sitter_rust:: language ( ) )
38+ . map_err ( |e| GitTypeError :: ExtractionFailed ( format ! ( "Failed to set Rust language: {}" , e) ) ) ?;
39+ }
40+ Language :: TypeScript => {
41+ parser. set_language ( tree_sitter_typescript:: language_typescript ( ) )
42+ . map_err ( |e| GitTypeError :: ExtractionFailed ( format ! ( "Failed to set TypeScript language: {}" , e) ) ) ?;
43+ }
44+ Language :: Python => {
45+ parser. set_language ( tree_sitter_python:: language ( ) )
46+ . map_err ( |e| GitTypeError :: ExtractionFailed ( format ! ( "Failed to set Python language: {}" , e) ) ) ?;
47+ }
48+ }
49+ Ok ( parser)
5050 }
5151
5252 pub fn extract_chunks ( & mut self , repo_path : & Path , options : ExtractionOptions ) -> Result < Vec < CodeChunk > > {
@@ -59,8 +59,6 @@ impl CodeExtractor {
5959 options : ExtractionOptions ,
6060 progress : & P ,
6161 ) -> Result < Vec < CodeChunk > > {
62- let mut chunks = Vec :: new ( ) ;
63-
6462 progress. set_phase ( "Scanning repository" . to_string ( ) ) ;
6563
6664 // Use ignore crate to respect .gitignore files
@@ -83,7 +81,7 @@ impl CodeExtractor {
8381
8482 if let Some ( extension) = path. extension ( ) . and_then ( |e| e. to_str ( ) ) {
8583 if let Some ( language) = Language :: from_extension ( extension) {
86- if self . should_process_file ( path, & options) {
84+ if Self :: should_process_file_static ( path, & options) {
8785 files_to_process. push ( ( path. to_path_buf ( ) , language) ) ;
8886 }
8987 }
@@ -93,24 +91,47 @@ impl CodeExtractor {
9391 let total_files = files_to_process. len ( ) ;
9492 progress. set_phase ( "Parsing AST" . to_string ( ) ) ;
9593
96- for ( i, ( path, language) ) in files_to_process. iter ( ) . enumerate ( ) {
97- progress. set_file_counts ( i + 1 , total_files) ;
98- if i % 3 == 0 { // Update spinner every 3 files to reduce flicker
99- progress. update_spinner ( ) ;
100- }
94+ // Process files in parallel with better progress tracking
95+ // Split files into smaller chunks for better progress visibility
96+ let chunk_size = ( total_files / 20 ) . max ( 1 ) . min ( 10 ) ; // Process in smaller chunks of 1-10 files
97+ let mut all_chunks = Vec :: new ( ) ;
98+ let mut processed_files = 0 ;
99+
100+ for chunk in files_to_process. chunks ( chunk_size) {
101+ // Process this chunk in parallel
102+ let chunk_results: Result < Vec < Vec < CodeChunk > > > = chunk
103+ . par_iter ( )
104+ . map ( |( path, language) | {
105+ Self :: extract_from_file_static ( path, * language, & options)
106+ } )
107+ . collect ( ) ;
108+
109+ // Update progress after each chunk
110+ processed_files += chunk. len ( ) ;
111+ progress. set_file_counts ( processed_files, total_files) ;
101112
102- let file_chunks = self . extract_from_file ( path, * language, & options) ?;
103- chunks. extend ( file_chunks) ;
113+ // Update spinner for each chunk to show progress
114+ progress. update_spinner ( ) ;
115+
116+ // Collect results
117+ let chunk_results = chunk_results?;
118+ for file_chunks in chunk_results {
119+ all_chunks. extend ( file_chunks) ;
120+ }
104121 }
105122
106123 progress. set_file_counts ( total_files, total_files) ;
107124 progress. set_current_file ( None ) ;
108125 progress. set_phase ( "Finalizing" . to_string ( ) ) ;
109126
110- Ok ( chunks )
127+ Ok ( all_chunks )
111128 }
112129
113130 fn should_process_file ( & self , path : & Path , options : & ExtractionOptions ) -> bool {
131+ Self :: should_process_file_static ( path, options)
132+ }
133+
134+ fn should_process_file_static ( path : & Path , options : & ExtractionOptions ) -> bool {
114135 let path_str = path. to_string_lossy ( ) ;
115136
116137 // Check exclude patterns first
@@ -135,17 +156,17 @@ impl CodeExtractor {
135156 }
136157
137158 pub fn extract_from_file ( & mut self , file_path : & Path , language : Language , options : & ExtractionOptions ) -> Result < Vec < CodeChunk > > {
159+ Self :: extract_from_file_static ( file_path, language, options)
160+ }
161+
162+ fn extract_from_file_static ( file_path : & Path , language : Language , options : & ExtractionOptions ) -> Result < Vec < CodeChunk > > {
138163 let content = fs:: read_to_string ( file_path) ?;
139- let parser = match language {
140- Language :: Rust => & mut self . rust_parser ,
141- Language :: TypeScript => & mut self . typescript_parser ,
142- Language :: Python => & mut self . python_parser ,
143- } ;
164+ let mut parser = Self :: create_parser_for_language ( language) ?;
144165
145166 let tree = parser. parse ( & content, None )
146167 . ok_or_else ( || GitTypeError :: ExtractionFailed ( format ! ( "Failed to parse file: {:?}" , file_path) ) ) ?;
147168
148- self . extract_chunks_from_tree ( & tree, & content, file_path, language, options)
169+ Self :: extract_chunks_from_tree_static ( & tree, & content, file_path, language, options)
149170 }
150171
151172 fn extract_chunks_from_tree (
@@ -155,11 +176,21 @@ impl CodeExtractor {
155176 file_path : & Path ,
156177 language : Language ,
157178 options : & ExtractionOptions ,
179+ ) -> Result < Vec < CodeChunk > > {
180+ Self :: extract_chunks_from_tree_static ( tree, source_code, file_path, language, options)
181+ }
182+
183+ fn extract_chunks_from_tree_static (
184+ tree : & Tree ,
185+ source_code : & str ,
186+ file_path : & Path ,
187+ language : Language ,
188+ options : & ExtractionOptions ,
158189 ) -> Result < Vec < CodeChunk > > {
159190 let mut chunks = Vec :: new ( ) ;
160191
161192 // Extract comment ranges for the entire file
162- let file_comment_ranges = self . extract_comment_ranges ( tree, source_code, language. clone ( ) ) ;
193+ let file_comment_ranges = Self :: extract_comment_ranges_static ( tree, source_code, language. clone ( ) ) ;
163194
164195 let query_str = match language {
165196 Language :: Rust => "
@@ -191,7 +222,7 @@ impl CodeExtractor {
191222 let node = capture. node ;
192223 let capture_name = & query. capture_names ( ) [ capture. index as usize ] ;
193224
194- if let Some ( chunk) = self . node_to_chunk ( node, source_code, file_path, language. clone ( ) , & capture_name, options, & file_comment_ranges) {
225+ if let Some ( chunk) = Self :: node_to_chunk_static ( node, source_code, file_path, language. clone ( ) , & capture_name, options, & file_comment_ranges) {
195226 chunks. push ( chunk) ;
196227 }
197228 }
@@ -209,6 +240,18 @@ impl CodeExtractor {
209240 capture_name : & str ,
210241 options : & ExtractionOptions ,
211242 file_comment_ranges : & [ ( usize , usize ) ] ,
243+ ) -> Option < CodeChunk > {
244+ Self :: node_to_chunk_static ( node, source_code, file_path, language, capture_name, options, file_comment_ranges)
245+ }
246+
247+ fn node_to_chunk_static (
248+ node : Node ,
249+ source_code : & str ,
250+ file_path : & Path ,
251+ language : Language ,
252+ capture_name : & str ,
253+ options : & ExtractionOptions ,
254+ file_comment_ranges : & [ ( usize , usize ) ] ,
212255 ) -> Option < CodeChunk > {
213256 let start_byte = node. start_byte ( ) ;
214257 let end_byte = node. end_byte ( ) ;
@@ -234,7 +277,7 @@ impl CodeExtractor {
234277 _ => return None ,
235278 } ;
236279
237- let name = self . extract_name ( node, source_code) . unwrap_or_else ( || "unknown" . to_string ( ) ) ;
280+ let name = Self :: extract_name_static ( node, source_code) . unwrap_or_else ( || "unknown" . to_string ( ) ) ;
238281
239282 // Filter comment ranges that are within this chunk and make them relative to chunk content
240283 let chunk_comment_ranges: Vec < ( usize , usize ) > = file_comment_ranges. iter ( )
@@ -249,7 +292,7 @@ impl CodeExtractor {
249292 . collect ( ) ;
250293
251294 // Normalize indentation based on AST node position
252- let ( normalized_content, normalized_comment_ranges) = self . normalize_indentation (
295+ let ( normalized_content, normalized_comment_ranges) = Self :: normalize_indentation_static (
253296 content,
254297 original_indentation,
255298 & chunk_comment_ranges
@@ -269,6 +312,10 @@ impl CodeExtractor {
269312 }
270313
271314 fn extract_name ( & self , node : Node , source_code : & str ) -> Option < String > {
315+ Self :: extract_name_static ( node, source_code)
316+ }
317+
318+ fn extract_name_static ( node : Node , source_code : & str ) -> Option < String > {
272319 // For variable_declarator, we need to get the name from the first child
273320 if node. kind ( ) == "variable_declarator" {
274321 let mut cursor = node. walk ( ) ;
@@ -301,6 +348,10 @@ impl CodeExtractor {
301348 }
302349
303350 fn normalize_indentation ( & self , content : & str , original_indentation : usize , comment_ranges : & [ ( usize , usize ) ] ) -> ( String , Vec < ( usize , usize ) > ) {
351+ Self :: normalize_indentation_static ( content, original_indentation, comment_ranges)
352+ }
353+
354+ fn normalize_indentation_static ( content : & str , original_indentation : usize , comment_ranges : & [ ( usize , usize ) ] ) -> ( String , Vec < ( usize , usize ) > ) {
304355 let lines: Vec < & str > = content. lines ( ) . collect ( ) ;
305356 if lines. is_empty ( ) {
306357 return ( content. to_string ( ) , comment_ranges. to_vec ( ) ) ;
@@ -396,6 +447,10 @@ impl CodeExtractor {
396447 }
397448
398449 fn extract_comment_ranges ( & self , tree : & Tree , source_code : & str , language : Language ) -> Vec < ( usize , usize ) > {
450+ Self :: extract_comment_ranges_static ( tree, source_code, language)
451+ }
452+
453+ fn extract_comment_ranges_static ( tree : & Tree , source_code : & str , language : Language ) -> Vec < ( usize , usize ) > {
399454 let mut comment_ranges = Vec :: new ( ) ;
400455
401456 let comment_query = match language {
0 commit comments