Fix: Add visited url properly to memory store #354

Merged 2 commits on Jan 21, 2025
knowledge/data-sources/website/colly.go (41 changes: 23 additions & 18 deletions)

```diff
@@ -25,16 +25,11 @@ const maxFileSize = 1024 * 1024 * 100
 
 func crawlColly(ctx context.Context, input *MetadataInput, output *MetadataOutput, logOut *logrus.Logger, gptscript *gptscript.GPTScript) error {
 	visited := make(map[string]struct{})
-	for url := range output.State.WebsiteCrawlingState.VisitedURLs {
-		filePath, err := convertUrlToFilePath(url)
-		if err != nil {
-			logOut.Errorf("Failed to convert URL to file path: %v", err)
-			continue
-		}
+	for _, filePath := range output.State.WebsiteCrawlingState.VisitedURLs {
 		visited[filePath] = struct{}{}
 	}
 	for _, url := range input.WebsiteCrawlingConfig.URLs {
-		if err := scrape(ctx, logOut, output, gptscript, visited, url, output.State.WebsiteCrawlingState.CurrentURL, input.Limit); err != nil {
+		if err := scrape(ctx, logOut, output, gptscript, visited, url, input.Limit); err != nil {
 			return fmt.Errorf("failed to scrape %s: %w", url, err)
 		}
 	}
```
```diff
@@ -54,18 +49,18 @@ func crawlColly(ctx context.Context, input *MetadataInput, output *MetadataOutpu
 	return writeMetadata(ctx, output, gptscript)
 }
 
-func scrape(ctx context.Context, logOut *logrus.Logger, output *MetadataOutput, gptscriptClient *gptscript.GPTScript, visited map[string]struct{}, url, urlToResume string, limit int) error {
+func scrape(ctx context.Context, logOut *logrus.Logger, output *MetadataOutput, gptscriptClient *gptscript.GPTScript, visited map[string]struct{}, url string, limit int) error {
 	collector := colly.NewCollector()
 
 	inMemoryStore := &storage.InMemoryStorage{}
 	inMemoryStore.Init()
 
-	for url := range visited {
-		if url == urlToResume {
+	for u := range output.State.WebsiteCrawlingState.VisitedURLs {
+		if u == url {
 			continue
 		}
 		h := fnv.New64a()
-		h.Write([]byte(url))
+		h.Write([]byte(u))
 		urlHash := h.Sum64()
 		inMemoryStore.Visited(urlHash)
 	}
```
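
For context, the loop above pre-seeds colly's in-memory storage so a resumed crawl skips already-fetched pages while still re-entering at the start URL. Here is a standalone sketch of that mechanism; it assumes the colly v1 `storage` package, and `seedVisited`, `urlHash`, and the example URLs are illustrative, not from the PR:

```go
package main

import (
	"fmt"
	"hash/fnv"

	"github.com/gocolly/colly/storage"
)

// urlHash mirrors how the crawler identifies a request: an FNV-1a hash of the
// URL string, matching the h := fnv.New64a() loop in scrape.
func urlHash(u string) uint64 {
	h := fnv.New64a()
	h.Write([]byte(u))
	return h.Sum64()
}

// seedVisited marks every previously crawled URL except the start URL, so the
// resumed crawl re-enters the site there and can still discover new links.
func seedVisited(store *storage.InMemoryStorage, visitedURLs map[string]string, startURL string) {
	for u := range visitedURLs {
		if u == startURL {
			continue
		}
		_ = store.Visited(urlHash(u))
	}
}

func main() {
	store := &storage.InMemoryStorage{}
	store.Init()

	seedVisited(store, map[string]string{
		"https://example.com/docs/a": "example.com/docs/a.md",
	}, "https://example.com")

	skipped, _ := store.IsVisited(urlHash("https://example.com/docs/a"))
	fmt.Println(skipped) // true: the collector will not re-fetch this page
}
```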
```diff
@@ -149,9 +144,6 @@ func scrape(ctx context.Context, logOut *logrus.Logger, output *MetadataOutput,
 			SizeInBytes: int64(len(data)),
 		}
 
-		output.State.WebsiteCrawlingState.CurrentURL = e.Request.URL.String()
-		output.State.WebsiteCrawlingState.VisitedURLs[output.State.WebsiteCrawlingState.CurrentURL] = struct{}{}
-
 		output.Status = fmt.Sprintf("Scraped %v", e.Request.URL.String())
 	})
```
```diff
@@ -212,13 +204,25 @@
 			}
 			linkURL = parsedLink
 		}
-		e.Request.Visit(linkURL.String())
+
+		if err := e.Request.Visit(linkURL.String()); err == nil {
+			filePath, err := convertUrlToFilePath(linkURL.String())
+			if err != nil {
+				logOut.Errorf("Failed to convert URL to file path: %v", err)
+				return
+			}
+
+			output.State.WebsiteCrawlingState.VisitedURLs[linkURL.String()] = filePath
+
+			if err := writeMetadata(ctx, output, gptscriptClient); err != nil {
+				logOut.Infof("Failed to write metadata: %v", err)
+				return
+			}
+		}
 	})
 
-	if urlToResume != "" {
-		return collector.Visit(urlToResume)
-	}
 	return collector.Visit(url)
 }
```

Contributor Author: Only mark a URL as visited once all of the links under it have been visited. Otherwise we could miss URLs on the next restart.
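
The comment above describes a checkpoint ordering: a link is recorded in the persisted state only after the collector has actually accepted it, so a crash can at worst re-visit a page, never silently skip one. A minimal sketch of that ordering, where `visit`, `markVisited`, and `persistState` are hypothetical stand-ins for `e.Request.Visit`, the `VisitedURLs` map update, and `writeMetadata`:

```go
package main

import (
	"errors"
	"fmt"
)

// checkpointLink records a link as visited only after the collector has
// accepted it, then persists state so a restart resumes from this point.
func checkpointLink(link string,
	visit func(string) error, // stand-in for e.Request.Visit
	markVisited func(string), // stand-in for VisitedURLs[link] = filePath
	persistState func() error, // stand-in for writeMetadata
) {
	if err := visit(link); err != nil {
		// Not recorded: a restarted crawl retries this link rather than
		// losing it, which is the failure mode the PR fixes.
		return
	}
	markVisited(link)
	if err := persistState(); err != nil {
		fmt.Println("persist failed:", err)
	}
}

func main() {
	visited := map[string]bool{}
	checkpointLink("https://example.com/a",
		func(string) error { return nil },
		func(u string) { visited[u] = true },
		func() error { return nil },
	)
	checkpointLink("https://example.com/b",
		func(string) error { return errors.New("filtered") },
		func(u string) { visited[u] = true },
		func() error { return nil },
	)
	fmt.Println(visited) // map[https://example.com/a:true]: b stays retryable
}
```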

```diff
@@ -346,6 +350,7 @@ func scrapePDF(ctx context.Context, logOut *logrus.Logger, output *MetadataOutpu
 		Checksum:    newChecksum,
 		SizeInBytes: int64(len(data)),
 	}
+	output.State.WebsiteCrawlingState.VisitedURLs[linkURL.String()] = filePath
 
 	if err := writeMetadata(ctx, output, gptscript); err != nil {
 		return fmt.Errorf("failed to write metadata: %v", err)
```
knowledge/data-sources/website/main.go (5 changes: 2 additions & 3 deletions)

```diff
@@ -32,8 +32,7 @@ type State struct {
 }
 
 type WebsiteCrawlingState struct {
-	CurrentURL  string              `json:"currentURL"`
-	VisitedURLs map[string]struct{} `json:"visitedURLs"`
+	VisitedURLs map[string]string `json:"visitedURLs"`
 }
 
 type FileDetails struct {
```

Contributor Author: Changed it to a map of string to string to account for the PDF file path.

```diff
@@ -88,7 +87,7 @@ func main() {
 	}
 
 	if output.State.WebsiteCrawlingState.VisitedURLs == nil {
-		output.State.WebsiteCrawlingState.VisitedURLs = make(map[string]struct{})
+		output.State.WebsiteCrawlingState.VisitedURLs = make(map[string]string)
 	}
 
 	if err := crawlColly(ctx, &input, &output, logErr, gptscriptClient); err != nil {
```
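
Since the shape of the persisted state changed, here is a quick sketch of what the new `visitedURLs` payload looks like on disk. The URLs and file paths are made up for illustration; `convertUrlToFilePath`'s real output may differ:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors the updated WebsiteCrawlingState: each visited URL now maps to the
// file path its content was stored under, letting HTML pages and PDFs (whose
// paths come from scrapePDF) share one map.
type WebsiteCrawlingState struct {
	VisitedURLs map[string]string `json:"visitedURLs"`
}

func main() {
	state := WebsiteCrawlingState{VisitedURLs: map[string]string{
		"https://example.com/guide":      "example.com/guide.md",
		"https://example.com/manual.pdf": "example.com/manual.pdf",
	}}

	raw, _ := json.MarshalIndent(state, "", "  ")
	fmt.Println(string(raw))

	// Restoring the state recovers the file path directly, with no need to
	// re-derive it via convertUrlToFilePath as the old code did.
	var restored WebsiteCrawlingState
	_ = json.Unmarshal(raw, &restored)
	fmt.Println(restored.VisitedURLs["https://example.com/manual.pdf"])
}
```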