@@ -332,6 +332,9 @@ func (component *WebScraper) Initialization(ctx context.Context, input map[strin
332
332
chromedp .Flag ("headless" , true ), // debug使用 false
333
333
chromedp .UserAgent (component .UserAgent ),
334
334
chromedp .Flag ("blink-settings" , "imagesEnabled=false" ),
335
+ chromedp .Flag ("ignore-certificate-errors" , true ), // ignore SSL certificate errors
336
+ chromedp .Flag ("disable-web-security" , true ), // disable same-origin policy restrictions
337
+ chromedp .Flag ("disable-hang-monitor" , true ), // disable the unresponsive-page (hang) detection
335
338
}
336
339
//初始化参数,先传一个空的数据
337
340
component .chromedpOptions = append (chromedp .DefaultExecAllocatorOptions [:], component .chromedpOptions ... )
@@ -379,24 +382,31 @@ func (component *WebScraper) FetchPage(ctx context.Context, document *Document,
379
382
qsLen := len (component .QuerySelector )
380
383
hcs := make ([]string , qsLen )
381
384
hrefs := make ([][]string , qsLen )
382
- actions := make ([]chromedp.Action , 0 )
383
- actions = append (actions , chromedp .Navigate (webURL ))
384
385
385
386
// 双重等待机制
386
- actions = append (actions , chromedp .WaitReady ("body" , chromedp .ByQuery )) // 等待body标签存在
387
- actions = append (actions , chromedp .Sleep (2 * time .Second )) // 容错性等待
388
- for i := 0 ; i < qsLen ; i ++ {
389
- action := chromedp .OuterHTML (component .QuerySelector [i ], & hcs [i ], chromedp .ByQuery )
390
- actions = append (actions , action )
391
- if component .Depth > 1 {
392
- hrefAction := chromedp .Evaluate (fmt .Sprintf ("Array.from(document.querySelector('%s').querySelectorAll('a')).map(a => a.href)" , component .QuerySelector [i ]), & hrefs [i ])
393
- actions = append (actions , hrefAction )
394
- }
387
+ bodyReady := chromedp .WaitReady ("body" , chromedp .ByQuery ) // 等待body标签存在
388
+ sleepReady := chromedp .Sleep (2 * time .Second ) // 容错性等待
389
+
390
+ // 自定义处理逻辑,忽略页面错误
391
+ actionFunc := chromedp .ActionFunc (func (ctx context.Context ) error {
392
+ for i := 0 ; i < qsLen ; i ++ {
393
+ //获取网页的内容,chromedp.AtLeast(0)立即执行,不要等待
394
+ err := chromedp .OuterHTML (component .QuerySelector [i ], & hcs [i ], chromedp .ByQuery , chromedp .AtLeast (0 )).Do (ctx )
395
+ if err != nil {
396
+ continue
397
+ }
398
+ // 获取页面的超链接
399
+ if component .Depth > 1 {
400
+ chromedp .Evaluate (fmt .Sprintf ("Array.from(document.querySelector('%s').querySelectorAll('a')).map(a => a.href)" , component .QuerySelector [i ]), & hrefs [i ]).Do (ctx )
401
+ }
395
402
396
- }
403
+ }
404
+ return nil
405
+ })
397
406
// 获取网页的title,放到最后再执行
398
- actions = append (actions , chromedp .Title (& title ))
399
- err := chromedp .Run (chromeCtx , actions ... )
407
+ titleAction := chromedp .Title (& title )
408
+ //执行动作
409
+ err := chromedp .Run (chromeCtx , chromedp .Navigate (webURL ), bodyReady , sleepReady , actionFunc , titleAction )
400
410
if err != nil {
401
411
return nil , err
402
412
}
0 commit comments