// cs/search.go at master · boyter/cs · GitHub
// SPDX-License-Identifier: MIT

package main

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"sync/atomic"

	"github.com/boyter/cs/v3/pkg/common"
	"github.com/boyter/cs/v3/pkg/ranker"
	"github.com/boyter/cs/v3/pkg/search"
	"github.com/boyter/cs/v3/pkg/snippet"
	"github.com/boyter/gocodewalker"
	"github.com/boyter/scc/v3/processor"
)

// SearchStats holds counters readable after the search channel drains.
// Both fields are atomics because the DoSearch workers update them from
// multiple goroutines while the search is running.
type SearchStats struct {
	// FileCount is the number of files taken off the work queue by the search
	// workers. It is incremented before the file is opened, so it includes
	// files that later fail to read or are skipped.
	FileCount     atomic.Int64
	// TextFileCount is the number of files that were read successfully, were
	// non-empty, and were not skipped by the binary (NUL byte) or minified
	// checks. It is incremented before the query is evaluated, so it counts
	// candidates rather than matches.
	TextFileCount atomic.Int64
}

// DoSearch runs the search pipeline and returns a channel of matched FileJob results
// plus stats that are populated as the search runs.
//
// The query is rejected with an error when it is longer than cfg.MaxQueryChars
// (measured in bytes) or contains more terms than cfg.MaxQueryTerms; a limit of
// zero or less disables the corresponding check. A query that parses to a nil
// AST produces an empty result set and a nil error. On every early return the
// channel is already closed, so callers may always range over it.
//
// If cache is non-nil, it will attempt to use cached file locations from a previous
// prefix query instead of walking the filesystem, and will store results for future use.
// Results are only stored when the search ran to completion: if ctx is cancelled
// the matched list may be partial, so it is not written to the cache.
//
// cfg is treated as read-only. Extensions derived from cfg.LanguageTypes are
// merged into a private copy of cfg.AllowListExtensions, so repeated or
// concurrent calls with the same Config do not grow or race on that slice.
//
// Cancelling ctx stops the workers and terminates the file walker; the returned
// channel is still closed once all workers have exited.
func DoSearch(ctx context.Context, cfg *Config, query string, cache *SearchCache) (<-chan *common.FileJob, *SearchStats, error) {
	out := make(chan *common.FileJob, runtime.NumCPU())
	stats := &SearchStats{}

	// Validate query length. len(query) is a byte count, so multi-byte
	// characters use up more of the budget than single-byte ones.
	if cfg.MaxQueryChars > 0 && len(query) > cfg.MaxQueryChars {
		close(out)
		return out, stats, fmt.Errorf("query too long: %d characters exceeds maximum of %d", len(query), cfg.MaxQueryChars)
	}

	// Parse query into AST. The parser's error result is discarded; only a nil
	// AST stops the search, and it is reported as "no results" rather than as
	// an error.
	lexer := search.NewLexer(strings.NewReader(query))
	parser := search.NewParser(lexer)
	ast, _ := parser.ParseQuery()
	if ast == nil {
		close(out)
		return out, stats, nil
	}

	// Validate query term count (counted once and reused in the message).
	if cfg.MaxQueryTerms > 0 {
		if termCount := search.CountAllTerms(ast); termCount > cfg.MaxQueryTerms {
			close(out)
			return out, stats, fmt.Errorf("query too complex: %d unique terms exceeds maximum of %d. Please refine your search terms.", termCount, cfg.MaxQueryTerms)
		}
	}
	transformer := &search.Transformer{}
	ast, _ = transformer.TransformAST(ast)
	ast = search.PlanAST(ast)

	// Resolve language types to extensions. The full slice expression caps the
	// capacity at the current length, so append is forced to copy instead of
	// writing into (or re-assigning) the caller's cfg.AllowListExtensions.
	allowExts := cfg.AllowListExtensions
	if len(cfg.LanguageTypes) > 0 {
		langExts := languageExtensions(cfg.LanguageTypes)
		allowExts = append(allowExts[:len(allowExts):len(allowExts)], langExts...)
	}

	// Determine walk directory
	dir := "."
	if strings.TrimSpace(cfg.Directory) != "" {
		dir = cfg.Directory
	}
	if cfg.FindRoot {
		dir = gocodewalker.FindRepositoryRoot(dir)
	}

	// Resolve to absolute path once so downstream filepath.Abs() calls
	// (inside gitignore matching, etc.) become no-op filepath.Clean()
	// instead of issuing an os.Getwd() syscall per file.
	// Error only possible if Getwd fails, in which case dir is unchanged
	// and the walker still functions with the original relative path.
	dir, _ = filepath.Abs(dir)

	fileQueue := make(chan *gocodewalker.File, 1000)

	// Try cache hit path: feed cached file locations instead of walking.
	// walkerToTerminate stays nil on a cache hit because no walker is started.
	var walkerToTerminate *gocodewalker.FileWalker
	cacheQuery := cfg.ContentFilterCachePrefix() + query
	servedFromCache := false
	if cache != nil {
		if cachedFiles, ok := cache.FindPrefixFiles(allowExts, cacheQuery); ok {
			servedFromCache = true
			go func() {
				// This goroutine is the only sender, so it owns closing the queue.
				defer close(fileQueue)
				for _, loc := range cachedFiles {
					select {
					case <-ctx.Done():
						return
					case fileQueue <- &gocodewalker.File{
						Location: loc,
						Filename: filepath.Base(loc),
					}:
					}
				}
			}()
		}
	}

	// Set up file walker (cache miss or no cache)
	if !servedFromCache {
		walker := gocodewalker.NewParallelFileWalker([]string{dir}, fileQueue)
		walker.AllowListExtensions = allowExts
		walker.IgnoreIgnoreFile = cfg.IgnoreIgnoreFile
		walker.IgnoreGitIgnore = cfg.IgnoreGitIgnore
		walker.LocationExcludePattern = cfg.LocationExcludePattern
		walker.IncludeHidden = cfg.IncludeHidden
		walker.ExcludeDirectory = cfg.PathDenylist
		walkerToTerminate = walker

		go func() { _ = walker.Start() }()
	}

	// Ensure walker is terminated on context cancellation. searchDone lets the
	// watcher goroutine exit when the search finishes without cancellation.
	searchDone := make(chan struct{})
	if walkerToTerminate != nil {
		walker := walkerToTerminate
		go func() {
			select {
			case <-ctx.Done():
				walker.Terminate()
			case <-searchDone:
			}
		}()
	}

	// Track matched file locations for cache population
	var matchedMu sync.Mutex
	var matchedLocations []string

	// Fan out workers to read and search files in parallel
	maxRead := cfg.MaxReadSizeBytes
	var wg sync.WaitGroup
	for i := 0; i < runtime.NumCPU(); i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()

			// Per-worker pooled buffer, reused across files. The pool may be
			// empty or hold a buffer that is too small for the current limit.
			var poolBuf []byte
			if v := bufPool.Get(); v != nil {
				poolBuf = v.([]byte)
			}
			if int64(len(poolBuf)) < maxRead {
				poolBuf = make([]byte, maxRead)
			}
			defer bufPool.Put(poolBuf)

			for f := range fileQueue {
				select {
				case <-ctx.Done():
					return
				default:
				}

				stats.FileCount.Add(1)

				// Read file content into pooled buffer (avoids fstat + per-file alloc)
				content, err := readFileContentBuf(f.Location, poolBuf[:maxRead])
				if err != nil || len(content) == 0 {
					continue
				}

				// Binary check: look for NUL byte in first 10KB
				if !cfg.IncludeBinaryFiles {
					check := content
					if len(check) > 10_000 {
						check = content[:10_000]
					}
					if bytes.IndexByte(check, 0) != -1 {
						continue
					}
				}

				// Minified check: skip files whose average line length exceeds
				// the configured threshold. lineCount is at least 1, so the
				// division is always safe.
				if !cfg.IncludeMinified {
					lineCount := bytes.Count(content, []byte("\n")) + 1
					avgLineLength := len(content) / lineCount
					if avgLineLength > cfg.MinifiedLineByteLength {
						continue
					}
				}

				stats.TextFileCount.Add(1)

				// Evaluate query AST against file content
				matched, matchLocations := search.EvaluateFile(ast, content, f.Filename, f.Location, cfg.CaseSensitive)
				if !matched {
					continue
				}

				// File matched — copy content out of the pooled buffer so it can
				// be safely stored in FileJob while the pool buffer is reused.
				// This must happen before post-eval filters (lang, content-type,
				// declarations) since they read content and the pool may reclaim
				// the buffer. The heavy filter (EvaluateFile) already passed, so
				// few files reach here only to be rejected by later filters.
				ownedContent := make([]byte, len(content))
				copy(ownedContent, content)
				content = ownedContent

				lang, sccLines, sccCode, sccComment, sccBlank, sccComplexity, contentByteType := fileCodeStats(f.Filename, content)

				// Post-evaluate metadata filters (lang, complexity) now that metadata is available
				if !search.PostEvalMetadataFilters(ast, lang, sccComplexity) {
					continue
				}

				// Filter match locations by content type when a filter is active
				if cfg.OnlyCode || cfg.OnlyComments || cfg.OnlyStrings {
					var survived bool
					matchLocations, survived = filterMatchLocations(matchLocations, contentByteType, cfg)
					if !survived {
						continue
					}
				}

				// Filter by declaration/usage when filter is active
				if cfg.OnlyDeclarations || cfg.OnlyUsages {
					declarations, usages := ranker.ClassifyMatchLocations(content, matchLocations, lang)

					if cfg.OnlyDeclarations {
						matchLocations = declarations
					} else {
						matchLocations = usages
					}

					anySurvived := false
					for _, locs := range matchLocations {
						if len(locs) > 0 {
							anySurvived = true
							break
						}
					}
					if !anySurvived {
						continue
					}
				}

				// Track matched file location for cache
				if cache != nil {
					matchedMu.Lock()
					matchedLocations = append(matchedLocations, f.Location)
					matchedMu.Unlock()
				}

				snippet.AddPhraseMatchLocations(content, strings.Trim(query, "\""), matchLocations)

				fj := &common.FileJob{
					Filename:        f.Filename,
					Extension:       gocodewalker.GetExtension(f.Filename),
					Location:        f.Location,
					Content:         content,
					ContentByteType: contentByteType,
					Bytes:           len(content),
					MatchLocations:  matchLocations,
					Language:        lang,
					Lines:           sccLines,
					Code:            sccCode,
					Comment:         sccComment,
					Blank:           sccBlank,
					Complexity:      sccComplexity,
				}

				// Do not block forever on a consumer that has gone away.
				select {
				case out <- fj:
				case <-ctx.Done():
					return
				}
			}
		}()
	}

	go func() {
		wg.Wait()
		close(out)
		close(searchDone)

		// Populate cache with matched file locations. All workers have exited,
		// so matchedLocations can be read without the mutex. A cancelled search
		// may have stopped before visiting every file; caching that partial list
		// would hide matches from later prefix queries, so it is skipped.
		if cache != nil && ctx.Err() == nil && len(matchedLocations) > 0 {
			cache.Store(allowExts, cacheQuery, matchedLocations)
		}
	}()

	return out, stats, nil
}

// filterMatchLocations keeps only the match locations whose first byte is
// classified as a content type permitted by the active filter (code,
// comments or strings; the first enabled flag in that order wins). It
// returns the filtered map and whether anything survived.
//
// When contentByteType is nil (unrecognised language) the content type
// cannot be verified: with a filter active the file is excluded, otherwise
// the locations are passed through untouched.
func filterMatchLocations(matchLocations map[string][][]int, contentByteType []byte, cfg *Config) (map[string][][]int, bool) {
	if contentByteType == nil {
		filterActive := cfg.OnlyCode || cfg.OnlyComments || cfg.OnlyStrings
		if filterActive {
			return nil, false
		}
		return matchLocations, len(matchLocations) > 0
	}

	// Lookup table indexed by byte type; entries left false are rejected.
	var permitted [256]bool
	switch {
	case cfg.OnlyCode:
		permitted[processor.ByteTypeCode] = true
		permitted[processor.ByteTypeBlank] = true
	case cfg.OnlyComments:
		permitted[processor.ByteTypeComment] = true
	case cfg.OnlyStrings:
		permitted[processor.ByteTypeString] = true
	}

	result := make(map[string][][]int, len(matchLocations))
	for term, spans := range matchLocations {
		var surviving [][]int
		for _, span := range spans {
			// A location needs at least a start and an end offset.
			if len(span) < 2 {
				continue
			}
			start := span[0]
			if start < 0 || start >= len(contentByteType) {
				continue
			}
			if permitted[contentByteType[start]] {
				surviving = append(surviving, span)
			}
		}
		if len(surviving) == 0 {
			continue
		}
		result[term] = surviving
	}

	// Terms are only inserted when they kept at least one location, so a
	// non-empty map means something survived.
	return result, len(result) > 0
}

// bufPool holds reusable read buffers for the search worker hot path.
// No New function is set, so Get returns nil when the pool is empty; the
// DoSearch workers handle that by allocating a buffer themselves and
// returning it to the pool with Put when they exit.
var bufPool sync.Pool

// readFileContentBuf opens location and fills buf with as much of the file
// as fits, returning the prefix of buf that was actually written. Files
// longer than len(buf) are truncated to len(buf) bytes; an empty file gives
// (nil, nil). Because the caller supplies a pre-sized buffer, no fstat call
// is needed to learn the file size.
func readFileContentBuf(location string, buf []byte) ([]byte, error) {
	file, openErr := os.Open(location)
	if openErr != nil {
		return nil, openErr
	}
	defer file.Close()

	read, readErr := io.ReadFull(file, buf)
	switch readErr {
	case nil:
		// The buffer was filled completely.
		return buf[:read], nil
	case io.EOF, io.ErrUnexpectedEOF:
		// The file is shorter than the buffer, which is the normal case.
		if read == 0 {
			return nil, nil
		}
		return buf[:read], nil
	default:
		return nil, readErr
	}
}

// readFileContent reads a file, limiting to maxBytes if the file is larger.
//
// The buffer is sized from the file's Stat size (capped at maxBytes). A file
// whose reported size is zero returns (nil, nil) without being read. If the
// file turns out to be shorter than Stat reported, the bytes that could be
// read are returned; if nothing at all could be read (for example the file
// was truncated after Stat, or it is a pseudo-file with a nominal size) the
// result is (nil, nil), the same as readFileContentBuf gives for an empty
// file. Any other open, stat or read error is returned with a nil slice.
func readFileContent(location string, maxBytes int64) ([]byte, error) {
	f, err := os.Open(location)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return nil, err
	}

	size := fi.Size()
	if size == 0 {
		return nil, nil
	}
	if size > maxBytes {
		size = maxBytes
	}

	buf := make([]byte, size)
	n, err := io.ReadFull(f, buf)
	switch err {
	case nil, io.ErrUnexpectedEOF:
		// Either the buffer was filled or the file ended early; both yield
		// the bytes actually read.
		return buf[:n], nil
	case io.EOF:
		// io.ReadFull reports io.EOF only when zero bytes were read, so the
		// file is effectively empty despite its Stat size.
		return nil, nil
	default:
		return nil, err
	}
}