@@ -12,7 +12,9 @@ All three readers will require a personal access token (which you can generate u
12
12
13
13
## Repository Reader
14
14
15
- This reader will read through a repo, with options to specifically filter directories and file extensions.
15
+ This reader will read through a repo, with options to filter by directory, file extension, and file path, and to apply custom per-file processing logic.
16
+
17
+ ### Basic Usage
16
18
17
19
``` python
18
20
from llama_index.readers.github import GithubRepositoryReader, GithubClient
@@ -47,6 +49,187 @@ reader = GithubRepositoryReader(
47
49
documents = reader.load_data(branch = " main" )
48
50
```
49
51
52
+ ### Advanced Filtering Options
53
+
54
+ #### Filter Specific File Paths
55
+
56
+ ``` python
57
+ # Include only specific files
58
+ reader = GithubRepositoryReader(
59
+ github_client = github_client,
60
+ owner = " run-llama" ,
61
+ repo = " llama_index" ,
62
+ filter_file_paths = (
63
+ [" README.md" , " src/main.py" , " docs/guide.md" ],
64
+ GithubRepositoryReader.FilterType.INCLUDE ,
65
+ ),
66
+ )
67
+
68
+ # Exclude specific files
69
+ reader = GithubRepositoryReader(
70
+ github_client = github_client,
71
+ owner = " run-llama" ,
72
+ repo = " llama_index" ,
73
+ filter_file_paths = (
74
+ [" tests/test_file.py" , " temp/cache.txt" ],
75
+ GithubRepositoryReader.FilterType.EXCLUDE ,
76
+ ),
77
+ )
78
+ ```
79
+
80
+ #### Custom File Processing Callback
81
+
82
+ ``` python
83
+ def process_file_callback (file_path : str , file_size : int ) -> tuple[bool , str ]:
84
+ """ Custom logic to determine if a file should be processed.
85
+
86
+ Args:
87
+ file_path: The full path to the file
88
+ file_size: The size of the file in bytes
89
+
90
+ Returns:
91
+ Tuple of (should_process: bool, reason: str)
92
+ """
93
+ # Skip large files
94
+ if file_size > 1024 * 1024 : # 1MB
95
+ return False , f " File too large: { file_size} bytes "
96
+
97
+ # Skip test files
98
+ if " test" in file_path.lower():
99
+ return False , " Skipping test files"
100
+
101
+ # Skip binary files by extension
102
+ binary_extensions = [" .exe" , " .bin" , " .so" , " .dylib" ]
103
+ if any (file_path.endswith(ext) for ext in binary_extensions):
104
+ return False , " Skipping binary files"
105
+
106
+ return True , " "
107
+
108
+
109
+ reader = GithubRepositoryReader(
110
+ github_client = github_client,
111
+ owner = " run-llama" ,
112
+ repo = " llama_index" ,
113
+ process_file_callback = process_file_callback,
114
+ fail_on_error = False , # Continue processing if callback fails
115
+ )
116
+ ```
117
+
118
+ #### Custom Parsers and Custom Folder for Temporary Files
119
+
120
+ ``` python
121
+ from llama_index.core.readers.base import BaseReader
122
+
123
+
124
+ # Custom parser for specific file types
125
+ class CustomMarkdownParser (BaseReader ):
126
+ def load_data (self , file_path , extra_info = None ):
127
+ # Custom parsing logic here
128
+ pass
129
+
130
+
131
+ reader = GithubRepositoryReader(
132
+ github_client = github_client,
133
+ owner = " run-llama" ,
134
+ repo = " llama_index" ,
135
+ use_parser = True ,
136
+ custom_parsers = {" .md" : CustomMarkdownParser()},
137
+ custom_folder = " /tmp/github_processing" , # Custom temp directory
138
+ )
139
+ ```
140
+
141
+ ### Event System Integration
142
+
143
+ The reader integrates with LlamaIndex's instrumentation system to provide detailed events during processing:
144
+
145
+ ``` python
146
+ from llama_index.core.instrumentation import get_dispatcher
147
+ from llama_index.core.instrumentation.event_handlers import BaseEventHandler
148
+ from llama_index.readers.github.repository.event import (
149
+ GitHubFileProcessedEvent,
150
+ GitHubFileSkippedEvent,
151
+ GitHubFileFailedEvent,
152
+ GitHubRepositoryProcessingStartedEvent,
153
+ GitHubRepositoryProcessingCompletedEvent,
154
+ )
155
+
156
+
157
+ class GitHubEventHandler (BaseEventHandler ):
158
+ def handle (self , event ):
159
+ if isinstance (event, GitHubRepositoryProcessingStartedEvent):
160
+ print (f " Started processing repository: { event.repository_name} " )
161
+ elif isinstance (event, GitHubFileProcessedEvent):
162
+ print (
163
+ f " Processed file: { event.file_path} ( { event.file_size} bytes) "
164
+ )
165
+ elif isinstance (event, GitHubFileSkippedEvent):
166
+ print (f " Skipped file: { event.file_path} - { event.reason} " )
167
+ elif isinstance (event, GitHubFileFailedEvent):
168
+ print (f " Failed to process file: { event.file_path} - { event.error} " )
169
+ elif isinstance (event, GitHubRepositoryProcessingCompletedEvent):
170
+ print (
171
+ f " Completed processing. Total documents: { event.total_documents} "
172
+ )
173
+
174
+
175
+ # Register the event handler
176
+ dispatcher = get_dispatcher()
177
+ handler = GitHubEventHandler()
178
+ dispatcher.add_event_handler(handler)
179
+
180
+ # Use the reader - events will be automatically dispatched
181
+ reader = GithubRepositoryReader(
182
+ github_client = github_client,
183
+ owner = " run-llama" ,
184
+ repo = " llama_index" ,
185
+ )
186
+ documents = reader.load_data(branch = " main" )
187
+ ```
188
+
189
+ #### Available Events
190
+
191
+ The following events are dispatched during repository processing:
192
+
193
+ - **`GitHubRepositoryProcessingStartedEvent`**: Fired when repository processing begins
194
+
195
+ - ` repository_name ` : Name of the repository (owner/repo)
196
+ - ` branch_or_commit ` : Branch name or commit SHA being processed
197
+
198
+ - **`GitHubRepositoryProcessingCompletedEvent`**: Fired when repository processing completes
199
+
200
+ - ` repository_name ` : Name of the repository
201
+ - ` branch_or_commit ` : Branch name or commit SHA
202
+ - ` total_documents ` : Number of documents created
203
+
204
+ - **`GitHubTotalFilesToProcessEvent`**: Fired with the total count of files to be processed
205
+
206
+ - ` repository_name ` : Name of the repository
207
+ - ` branch_or_commit ` : Branch name or commit SHA
208
+ - ` total_files ` : Total number of files found
209
+
210
+ - **`GitHubFileProcessingStartedEvent`**: Fired when individual file processing starts
211
+
212
+ - ` file_path ` : Path to the file being processed
213
+ - ` file_type ` : File extension
214
+
215
+ - **`GitHubFileProcessedEvent`**: Fired when a file is successfully processed
216
+
217
+ - ` file_path ` : Path to the processed file
218
+ - ` file_type ` : File extension
219
+ - ` file_size ` : Size of the file in bytes
220
+ - ` document ` : The created Document object
221
+
222
+ - **`GitHubFileSkippedEvent`**: Fired when a file is skipped
223
+
224
+ - ` file_path ` : Path to the skipped file
225
+ - ` file_type ` : File extension
226
+ - ` reason ` : Reason why the file was skipped
227
+
228
+ - **`GitHubFileFailedEvent`**: Fired when file processing fails
229
+ - ` file_path ` : Path to the failed file
230
+ - ` file_type ` : File extension
231
+ - ` error ` : Error message describing the failure
232
+
50
233
## Issues Reader
51
234
52
235
``` python
0 commit comments