@@ -14,21 +14,17 @@ import (
1414 "strings"
1515 "time"
1616
17- md "github.com/JohannesKaufmann/html-to-markdown/v2/converter"
18- "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/base"
19- "github.com/JohannesKaufmann/html-to-markdown/v2/plugin/commonmark"
2017 "github.com/gocolly/colly"
2118 "github.com/gptscript-ai/go-gptscript"
2219 "github.com/sirupsen/logrus"
2320)
2421
2522func crawlColly (ctx context.Context , input * MetadataInput , output * MetadataOutput , logOut * logrus.Logger , gptscript * gptscript.GPTScript ) error {
26- converter := md .NewConverter (md .WithPlugins (base .NewBasePlugin (), commonmark .NewCommonmarkPlugin ()))
2723 visited := make (map [string ]struct {})
2824 folders := make (map [string ]struct {})
2925
3026 for _ , url := range input .WebsiteCrawlingConfig .URLs {
31- if err := scrape (ctx , converter , logOut , output , gptscript , visited , folders , url , input .Limit ); err != nil {
27+ if err := scrape (ctx , logOut , output , gptscript , visited , folders , url , input .Limit ); err != nil {
3228 return fmt .Errorf ("failed to scrape %s: %w" , url , err )
3329 }
3430 }
@@ -47,19 +43,14 @@ func crawlColly(ctx context.Context, input *MetadataInput, output *MetadataOutpu
4743 return writeMetadata (ctx , output , gptscript )
4844}
4945
50- func scrape (ctx context.Context , converter * md. Converter , logOut * logrus.Logger , output * MetadataOutput , gptscriptClient * gptscript.GPTScript , visited map [string ]struct {}, folders map [string ]struct {}, url string , limit int ) error {
46+ func scrape (ctx context.Context , logOut * logrus.Logger , output * MetadataOutput , gptscriptClient * gptscript.GPTScript , visited map [string ]struct {}, folders map [string ]struct {}, url string , limit int ) error {
5147 collector := colly .NewCollector ()
5248 collector .OnHTML ("body" , func (e * colly.HTMLElement ) {
5349 html , err := e .DOM .Html ()
5450 if err != nil {
5551 logOut .Errorf ("Failed to grab HTML: %v" , err )
5652 return
5753 }
58- markdown , err := converter .ConvertString (html )
59- if err != nil {
60- logOut .Errorf ("Failed to convert HTML to markdown: %v" , err )
61- return
62- }
6354 hostname := e .Request .URL .Hostname ()
6455 urlPathWithQuery := e .Request .URL .Path
6556 if e .Request .URL .RawQuery != "" {
@@ -68,14 +59,14 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
6859
6960 var filePath string
7061 if urlPathWithQuery == "" {
71- filePath = path .Join (hostname , "index.md " )
62+ filePath = path .Join (hostname , "index.html " )
7263 } else {
7364 trimmedPath := strings .Trim (urlPathWithQuery , "/" )
7465 if trimmedPath == "" {
75- filePath = path .Join (hostname , "index.md " )
66+ filePath = path .Join (hostname , "index.html " )
7667 } else {
7768 segments := strings .Split (trimmedPath , "/" )
78- fileName := segments [len (segments )- 1 ] + ".md "
69+ fileName := segments [len (segments )- 1 ] + ".html "
7970 filePath = path .Join (hostname , strings .Join (segments [:len (segments )- 1 ], "/" ), fileName )
8071 }
8172 }
@@ -113,7 +104,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
113104 return
114105 }
115106
116- checksum , err := getChecksum ([]byte (markdown ))
107+ checksum , err := getChecksum ([]byte (html ))
117108 if err != nil {
118109 logOut .Errorf ("Failed to get checksum for %s: %v" , e .Request .URL .String (), err )
119110 return
@@ -124,7 +115,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
124115 return
125116 }
126117
127- if err := gptscriptClient .WriteFileInWorkspace (ctx , filePath , []byte (markdown )); err != nil {
118+ if err := gptscriptClient .WriteFileInWorkspace (ctx , filePath , []byte (html )); err != nil {
128119 logOut .Errorf ("Failed to write file %s: %v" , filePath , err )
129120 return
130121 }
@@ -136,7 +127,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
136127 URL : e .Request .URL .String (),
137128 UpdatedAt : updatedAt ,
138129 Checksum : checksum ,
139- SizeInBytes : int64 (len ([]byte (markdown ))),
130+ SizeInBytes : int64 (len ([]byte (html ))),
140131 }
141132
142133 folders [hostname ] = struct {}{}
@@ -173,7 +164,7 @@ func scrape(ctx context.Context, converter *md.Converter, logOut *logrus.Logger,
173164 return
174165 }
175166
176- // if linkURL has absolute path and it doesn't match baseURL, skip
167+ // if linkURL has an absolute path that does not start with baseURL's path, skip
177168 if strings .HasPrefix (linkURL .Path , "/" ) && ! strings .HasPrefix (linkURL .Path , baseURL .Path ) {
178169 return
179170 }
0 commit comments