Skip to content

Commit 4ccadfa

Browse files
committed
修复爬虫等待超时
1 parent 51793e1 commit 4ccadfa

File tree

2 files changed

+31
-18
lines changed

2 files changed

+31
-18
lines changed

componentOpenAI.go

+24-14
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,9 @@ func (component *WebScraper) Initialization(ctx context.Context, input map[strin
332332
chromedp.Flag("headless", true), // debug使用 false
333333
chromedp.UserAgent(component.UserAgent),
334334
chromedp.Flag("blink-settings", "imagesEnabled=false"),
335+
chromedp.Flag("ignore-certificate-errors", true), // 忽略SSL证书错误[1](@ref)
336+
chromedp.Flag("disable-web-security", true), // 禁用同源策略限制[1](@ref)
337+
chromedp.Flag("disable-hang-monitor", true), // 禁用页面无响应检测[1](@ref)
335338
}
336339
//初始化参数,先传一个空的数据
337340
component.chromedpOptions = append(chromedp.DefaultExecAllocatorOptions[:], component.chromedpOptions...)
@@ -379,24 +382,31 @@ func (component *WebScraper) FetchPage(ctx context.Context, document *Document,
379382
qsLen := len(component.QuerySelector)
380383
hcs := make([]string, qsLen)
381384
hrefs := make([][]string, qsLen)
382-
actions := make([]chromedp.Action, 0)
383-
actions = append(actions, chromedp.Navigate(webURL))
384385

385386
// 双重等待机制
386-
actions = append(actions, chromedp.WaitReady("body", chromedp.ByQuery)) // 等待body标签存在
387-
actions = append(actions, chromedp.Sleep(2*time.Second)) // 容错性等待
388-
for i := 0; i < qsLen; i++ {
389-
action := chromedp.OuterHTML(component.QuerySelector[i], &hcs[i], chromedp.ByQuery)
390-
actions = append(actions, action)
391-
if component.Depth > 1 {
392-
hrefAction := chromedp.Evaluate(fmt.Sprintf("Array.from(document.querySelector('%s').querySelectorAll('a')).map(a => a.href)", component.QuerySelector[i]), &hrefs[i])
393-
actions = append(actions, hrefAction)
394-
}
387+
bodyReady := chromedp.WaitReady("body", chromedp.ByQuery) // 等待body标签存在
388+
sleepReady := chromedp.Sleep(2 * time.Second) // 容错性等待
389+
390+
// 自定义处理逻辑,忽略页面错误
391+
actionFunc := chromedp.ActionFunc(func(ctx context.Context) error {
392+
for i := 0; i < qsLen; i++ {
393+
//获取网页的内容,chromedp.AtLeast(0)立即执行,不要等待
394+
err := chromedp.OuterHTML(component.QuerySelector[i], &hcs[i], chromedp.ByQuery, chromedp.AtLeast(0)).Do(ctx)
395+
if err != nil {
396+
continue
397+
}
398+
// 获取页面的超链接
399+
if component.Depth > 1 {
400+
chromedp.Evaluate(fmt.Sprintf("Array.from(document.querySelector('%s').querySelectorAll('a')).map(a => a.href)", component.QuerySelector[i]), &hrefs[i]).Do(ctx)
401+
}
395402

396-
}
403+
}
404+
return nil
405+
})
397406
// 获取网页的title,放到最后再执行
398-
actions = append(actions, chromedp.Title(&title))
399-
err := chromedp.Run(chromeCtx, actions...)
407+
titleAction := chromedp.Title(&title)
408+
//执行动作
409+
err := chromedp.Run(chromeCtx, chromedp.Navigate(webURL), bodyReady, sleepReady, actionFunc, titleAction)
400410
if err != nil {
401411
return nil, err
402412
}

routeAdmin.go

+7-4
Original file line numberDiff line numberDiff line change
@@ -1001,18 +1001,21 @@ func funcWebScraper(ctx context.Context, c *app.RequestContext) {
10011001
return
10021002
}
10031003
webScraperDocuments := make([]Document, 0)
1004-
webScraperHrefs := make(map[string]bool, 0)
1005-
webScraperHrefs[""] = true
1004+
webScraperHrefMap := make(map[string]bool, 0)
1005+
webScraperHrefMap[""] = true
10061006
now := time.Now().Format("2006-01-02 15:04:05")
10071007
go func() {
10081008
// 递归抓取网页
1009-
recursiveScraper(ctx, &webScraperDocuments, &webScraperHrefs, webScraper)
1010-
input := make(map[string]interface{}, 0)
1009+
recursiveScraper(ctx, &webScraperDocuments, &webScraperHrefMap, webScraper)
1010+
maxSortNo := funcMaxSortNo(tableDocumentName)
1011+
//循环处理所有的网页
10111012
for i := 0; i < len(webScraperDocuments); i++ {
10121013
doc := webScraperDocuments[i]
10131014
if doc.Id == "" {
10141015
continue
10151016
}
1017+
doc.SortNo = maxSortNo + i
1018+
input := make(map[string]interface{}, 0)
10161019
input["document"] = &doc
10171020
//清洗html标签
10181021
hc := &HtmlCleaner{}

0 commit comments

Comments (0)