Skip to content

Commit 20b3c81

Browse files
feat: extract all website-cleaning into the new code-based website-cleaner tool (#268)
* chore: knowledge linter complaints
* change: make website data-source tool output unmodified html
* chore: knowledge - default llm gpt-4o
* feat: add website-cleaner code-based tool

---------

Co-authored-by: Grant Linville <[email protected]>
1 parent 4b7b0d7 commit 20b3c81

File tree

12 files changed

+451
-72
lines changed

12 files changed

+451
-72
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
**/venv
55
**/bin
66
**/node_modules
7-
*__pycache__
7+
*__pycache__
8+
**/vendor

knowledge/data-sources/website/README.md

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,4 @@ provide a .metadata.json file to specify the website to be scraped and the local
2020
"urls": ["https://coral.org"]
2121
}
2222
}
23-
```
24-
25-
### Mode
26-
27-
There are two modes to run the tool, `colly` and `firecrawl`.
28-
29-
```bash
30-
MODE=colly go run main.go
31-
```
32-
33-
```bash
34-
MODE=firecrawl go run main.go
35-
```
23+
```

knowledge/data-sources/website/colly.go

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,17 @@ import (
1414
"strings"
1515
"time"
1616

17-
md "github.com/JohannesKaufmann/html-to-markdown/v2/converter"
18-
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
19-
"github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
2017
"github.com/gocolly/colly"
2118
"github.com/gptscript-ai/go-gptscript"
2219
"github.com/sirupsen/logrus"
2320
)
2421

2522
func crawlColly(ctx context.Context, input *MetadataInput, output *MetadataOutput, logOut *logrus.Logger, gptscript *gptscript.GPTScript) error {
26-
converter := md.NewConverter(md.WithPlugins(base.NewBasePlugin(), commonmark.NewCommonmarkPlugin()))
2723
visited := make(map[string]struct{})
2824
folders := make(map[string]struct{})
2925

3026
for _, url := range input.WebsiteCrawlingConfig.URLs {
31-
if err := scrape(ctx, converter, logOut, output, gptscript, visited, folders, url, input.Limit); err != nil {
27+
if err := scrape(ctx, logOut, output, gptscript, visited, folders, url, input.Limit); err != nil {
3228
return fmt.Errorf("failed to scrape %s: %w", url, err)
3329
}
3430
}
@@ -47,19 +43,14 @@ func crawlColly(ctx context.Context, input *MetadataInput, output *MetadataOutpu
4743
return writeMetadata(ctx, output, gptscript)
4844
}
4945

50-
func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger, output *MetadataOutput, gptscriptClient *gptscript.GPTScript, visited map[string]struct{}, folders map[string]struct{}, url string, limit int) error {
46+
func scrape(ctx context.Context, logOut *logrus.Logger, output *MetadataOutput, gptscriptClient *gptscript.GPTScript, visited map[string]struct{}, folders map[string]struct{}, url string, limit int) error {
5147
collector := colly.NewCollector()
5248
collector.OnHTML("body", func(e *colly.HTMLElement) {
5349
html, err := e.DOM.Html()
5450
if err != nil {
5551
logOut.Errorf("Failed to grab HTML: %v", err)
5652
return
5753
}
58-
markdown, err := converter.ConvertString(html)
59-
if err != nil {
60-
logOut.Errorf("Failed to convert HTML to markdown: %v", err)
61-
return
62-
}
6354
hostname := e.Request.URL.Hostname()
6455
urlPathWithQuery := e.Request.URL.Path
6556
if e.Request.URL.RawQuery != "" {
@@ -68,14 +59,14 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
6859

6960
var filePath string
7061
if urlPathWithQuery == "" {
71-
filePath = path.Join(hostname, "index.md")
62+
filePath = path.Join(hostname, "index.html")
7263
} else {
7364
trimmedPath := strings.Trim(urlPathWithQuery, "/")
7465
if trimmedPath == "" {
75-
filePath = path.Join(hostname, "index.md")
66+
filePath = path.Join(hostname, "index.html")
7667
} else {
7768
segments := strings.Split(trimmedPath, "/")
78-
fileName := segments[len(segments)-1] + ".md"
69+
fileName := segments[len(segments)-1] + ".html"
7970
filePath = path.Join(hostname, strings.Join(segments[:len(segments)-1], "/"), fileName)
8071
}
8172
}
@@ -113,7 +104,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
113104
return
114105
}
115106

116-
checksum, err := getChecksum([]byte(markdown))
107+
checksum, err := getChecksum([]byte(html))
117108
if err != nil {
118109
logOut.Errorf("Failed to get checksum for %s: %v", e.Request.URL.String(), err)
119110
return
@@ -124,7 +115,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
124115
return
125116
}
126117

127-
if err := gptscriptClient.WriteFileInWorkspace(ctx, filePath, []byte(markdown)); err != nil {
118+
if err := gptscriptClient.WriteFileInWorkspace(ctx, filePath, []byte(html)); err != nil {
128119
logOut.Errorf("Failed to write file %s: %v", filePath, err)
129120
return
130121
}
@@ -136,7 +127,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
136127
URL: e.Request.URL.String(),
137128
UpdatedAt: updatedAt,
138129
Checksum: checksum,
139-
SizeInBytes: int64(len([]byte(markdown))),
130+
SizeInBytes: int64(len([]byte(html))),
140131
}
141132

142133
folders[hostname] = struct{}{}
@@ -173,7 +164,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
173164
return
174165
}
175166

176-
// if linkURL has absolute path and it doesn't match baseURL, skip
167+
// if linkURL has absolute path, and it doesn't match baseURL, skip
177168
if strings.HasPrefix(linkURL.Path, "/") && !strings.HasPrefix(linkURL.Path, baseURL.Path) {
178169
return
179170
}

knowledge/data-sources/website/go.mod

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,13 @@ go 1.23.1
66
toolchain go1.23.2
77

88
require (
9-
github.com/JohannesKaufmann/html-to-markdown/v2 v2.2.1
10-
github.com/PuerkitoBio/goquery v1.9.2
119
github.com/gocolly/colly v1.2.0
1210
github.com/gptscript-ai/go-gptscript v0.9.6-0.20241023195750-c09e0f56b39b
1311
github.com/sirupsen/logrus v1.9.3
1412
)
1513

1614
require (
17-
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 // indirect
15+
github.com/PuerkitoBio/goquery v1.9.2 // indirect
1816
github.com/andybalholm/cascadia v1.3.2 // indirect
1917
github.com/antchfx/htmlquery v1.3.2 // indirect
2018
github.com/antchfx/xmlquery v1.4.1 // indirect
@@ -33,7 +31,6 @@ require (
3331
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
3432
github.com/perimeterx/marshmallow v1.1.5 // indirect
3533
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
36-
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect
3734
github.com/stretchr/testify v1.9.0 // indirect
3835
github.com/temoto/robotstxt v1.1.2 // indirect
3936
golang.org/x/net v0.31.0 // indirect

knowledge/data-sources/website/go.sum

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 h1:TDlO/A2QqlNhdvH+hDnu8cv1rouhfHgLwhGzJeHGgFQ=
2-
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364/go.mod h1:U+fBZLZTYiZCOwQUT04V3J4I+0TxyLNnj0R8nBlO4fk=
3-
github.com/JohannesKaufmann/html-to-markdown/v2 v2.2.1 h1:CTdlXnVjuOA8nh2NRjPx2hZvrSirvqWmgMfYSsgh3+8=
4-
github.com/JohannesKaufmann/html-to-markdown/v2 v2.2.1/go.mod h1:/4SMA6sya4rFx35o6hHFhK47vKunlKqrw1anAVsihGQ=
51
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
62
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
73
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
@@ -43,11 +39,8 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm
4339
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
4440
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
4541
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
46-
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
4742
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
4843
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
49-
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
50-
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
5144
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
5245
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
5346
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
@@ -62,15 +55,10 @@ github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU
6255
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
6356
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
6457
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
65-
github.com/sebdah/goldie/v2 v2.5.5 h1:rx1mwF95RxZ3/83sdS4Yp7t2C5TCokvWP4TBRbAyEWY=
66-
github.com/sebdah/goldie/v2 v2.5.5/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
67-
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8=
68-
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
6958
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
7059
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
7160
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
7261
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
73-
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
7462
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
7563
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
7664
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
@@ -79,8 +67,6 @@ github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr
7967
github.com/ugorji/go/codec v1.2.7 h1:YPXUKf7fYbp/y8xloBqZOw2qaVggbfwMlI8WM3wZUJ0=
8068
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
8169
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
82-
github.com/yuin/goldmark v1.7.8 h1:iERMLn0/QJeHFhxSt3p6PeN9mGnvIKSpG9YYorDMnic=
83-
github.com/yuin/goldmark v1.7.8/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
8470
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
8571
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
8672
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
@@ -130,11 +116,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0
130116
google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk=
131117
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
132118
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
133-
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
134119
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
135120
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
136-
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
137-
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
138121
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
139122
gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
140123
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=

knowledge/data-sources/website/main.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,6 @@ func main() {
8686
output.Files = make(map[string]FileDetails)
8787
}
8888

89-
mode := os.Getenv("MODE")
90-
if mode == "" {
91-
mode = "colly"
92-
}
93-
9489
if err := crawlColly(ctx, &input, &output, logErr, gptscriptClient); err != nil {
9590
logOut.WithError(fmt.Errorf("failed to crawl website: error: %w", err)).Error()
9691
os.Exit(0)

knowledge/pkg/datastore/defaults/defaults.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
const (
88
TopK int = 10
99

10-
TokenModel = "gpt-4"
10+
TokenModel = "llm"
1111
TokenEncoding = "cl100k_base"
1212
ChunkSizeTokens = 2048
1313
ChunkOverlapTokens = 256

knowledge/pkg/datastore/embeddings/openai/openai.go

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ const EmbeddingModelProviderOpenAIName string = "openai"
3131
type EmbeddingModelProviderOpenAI struct {
3232
BaseURL string `usage:"OpenAI API base" default:"https://api.openai.com/v1" env:"OPENAI_BASE_URL" koanf:"baseURL"`
3333
APIKey string `usage:"OpenAI API key (not required if used with clicky-chats)" default:"sk-foo" env:"OPENAI_API_KEY" koanf:"apiKey" mapstructure:"apiKey" export:"false"`
34-
Model string `usage:"OpenAI model" default:"gpt-4" env:"OPENAI_MODEL" koanf:"openai-model"`
34+
Model string `usage:"OpenAI model" default:"gpt-4o" env:"OPENAI_MODEL" koanf:"openai-model"`
3535
EmbeddingModel string `usage:"OpenAI Embedding model" default:"text-embedding-3-large" env:"OPENAI_EMBEDDING_MODEL" koanf:"embeddingModel" export:"required"`
3636
EmbeddingEndpoint string `usage:"OpenAI Embedding endpoint" default:"/embeddings" env:"OPENAI_EMBEDDING_ENDPOINT" koanf:"embeddingEndpoint"`
3737
APIVersion string `usage:"OpenAI API version (for Azure)" default:"2024-02-01" env:"OPENAI_API_VERSION" koanf:"apiVersion"`
@@ -42,7 +42,7 @@ type EmbeddingModelProviderOpenAI struct {
4242
type OpenAIConfig struct {
4343
BaseURL string `usage:"OpenAI API base" default:"https://api.openai.com/v1" env:"OPENAI_BASE_URL" koanf:"baseURL"`
4444
APIKey string `usage:"OpenAI API key (not required if used with clicky-chats)" default:"sk-foo" env:"OPENAI_API_KEY" koanf:"apiKey" mapstructure:"apiKey" export:"false"`
45-
Model string `usage:"OpenAI model" default:"gpt-4" env:"OPENAI_MODEL" koanf:"openai-model"`
45+
Model string `usage:"OpenAI model" default:"gpt-4o" env:"OPENAI_MODEL" koanf:"openai-model"`
4646
EmbeddingModel string `usage:"OpenAI Embedding model" default:"text-embedding-3-large" env:"OPENAI_EMBEDDING_MODEL" koanf:"embeddingModel" export:"required"`
4747
EmbeddingEndpoint string `usage:"OpenAI Embedding endpoint" default:"/embeddings" env:"OPENAI_EMBEDDING_ENDPOINT" koanf:"embeddingEndpoint"`
4848
APIVersion string `usage:"OpenAI API version (for Azure)" default:"2024-02-01" env:"OPENAI_API_VERSION" koanf:"apiVersion"`
@@ -101,7 +101,7 @@ func (p *EmbeddingModelProviderOpenAI) fillDefaults() error {
101101
defaultConfig := EmbeddingModelProviderOpenAI{
102102
BaseURL: "https://api.openai.com/v1",
103103
APIKey: "sk-foo",
104-
Model: "gpt-4",
104+
Model: "gpt-4o",
105105
EmbeddingModel: "text-embedding-3-large",
106106
EmbeddingEndpoint: "/embeddings",
107107
APIVersion: "2024-02-01",
@@ -316,18 +316,17 @@ func RequestWithExponentialBackoff(ctx context.Context, client *http.Client, req
316316

317317
resp, err = client.Do(req)
318318
if err == nil && resp.StatusCode == http.StatusOK {
319-
defer resp.Body.Close()
320-
321319
body, err := io.ReadAll(resp.Body)
322320
if err != nil {
323321
// Log the error and retry for transient error reading response body
324322
msg := fmt.Sprintf("#%d/%d: failed to read response body: %v", i+1, maxRetries, err)
325323
logger.Warn("Request failed - Retryable", "error", msg)
326324
failures = append(failures, msg)
325+
_ = resp.Body.Close()
327326
continue
328327
}
329328

330-
return body, nil
329+
return body, resp.Body.Close()
331330
}
332331

333332
if resp != nil {
@@ -337,7 +336,7 @@ func RequestWithExponentialBackoff(ctx context.Context, client *http.Client, req
337336
if rerr == nil {
338337
bodystr = string(body)
339338
}
340-
resp.Body.Close()
339+
_ = resp.Body.Close()
341340
}
342341

343342
msg := fmt.Sprintf("#%d/%d: %d <%s> (err: %v)", i+1, maxRetries, resp.StatusCode, bodystr, err)
@@ -353,7 +352,7 @@ func RequestWithExponentialBackoff(ctx context.Context, client *http.Client, req
353352
time.Sleep(delay + jitter)
354353
continue
355354
} else {
356-
// Non-retriable error
355+
// Non-retryable error
357356
logger.Error("Request failed - Non-retryable", "error", msg)
358357
break
359358
}
@@ -365,9 +364,9 @@ func RequestWithExponentialBackoff(ctx context.Context, client *http.Client, req
365364
}
366365
}
367366

368-
logger.Error("request retry limit exceeded or failed with non-retriable error(s)", "request", req)
367+
logger.Error("request retry limit exceeded or failed with non-retryable error(s)", "request", req)
369368

370-
return nil, fmt.Errorf("retry limit (%d) exceeded or failed with non-retriable error(s): %v", maxRetries, strings.Join(failures, "; "))
369+
return nil, fmt.Errorf("retry limit (%d) exceeded or failed with non-retryable error(s): %v", maxRetries, strings.Join(failures, "; "))
371370
}
372371

373372
type OpenAICompatConfig struct {

website-cleaner/go.mod

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
module github.com/otto8-ai/tools/website-cleaner
2+
3+
go 1.23.3
4+
5+
require (
6+
github.com/JohannesKaufmann/html-to-markdown/v2 v2.2.1
7+
github.com/PuerkitoBio/goquery v1.10.0
8+
github.com/gptscript-ai/go-gptscript v0.9.6-0.20241122154251-2ddfb8e12f34
9+
github.com/sirupsen/logrus v1.9.3
10+
)
11+
12+
require (
13+
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 // indirect
14+
github.com/andybalholm/cascadia v1.3.2 // indirect
15+
github.com/getkin/kin-openapi v0.128.0 // indirect
16+
github.com/go-openapi/jsonpointer v0.21.0 // indirect
17+
github.com/go-openapi/swag v0.23.0 // indirect
18+
github.com/invopop/yaml v0.3.1 // indirect
19+
github.com/josharian/intern v1.0.0 // indirect
20+
github.com/mailru/easyjson v0.7.7 // indirect
21+
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
22+
github.com/perimeterx/marshmallow v1.1.5 // indirect
23+
golang.org/x/net v0.31.0 // indirect
24+
golang.org/x/sys v0.27.0 // indirect
25+
gopkg.in/yaml.v3 v3.0.1 // indirect
26+
)

0 commit comments

Comments
 (0)