-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_service.py
More file actions
122 lines (87 loc) · 3.42 KB
/
crawl_service.py
File metadata and controls
122 lines (87 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from omegaconf import DictConfig
import hydra
import sys, os
from queue import Queue
from scrapegraphai.graphs import SmartScraperGraph, SearchGraph
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from config.envconfig import envconfig
from utils.llm import LLM
from prompt.web_scrape import get_web_search_prompt, get_detail_content_prompt
from utils.output_schema import EventList, DetailEventList
import logging
from tqdm import tqdm
# Module-level logging setup: basicConfig at import time so INFO-level
# messages (including tqdm-adjacent progress logs) are visible when this
# file is run as a script. NOTE(review): calling basicConfig on import
# affects any process that imports this module — confirm that is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class CrawlService:
    """Web crawling/scraping service built on scrapegraphai graphs.

    Wraps a ``SearchGraph`` (query-driven web search) and a
    ``SmartScraperGraph`` (single-URL scrape) around a shared LLM and
    embedding model obtained from the project's ``LLM`` factory.
    """

    # Single source of truth for the default extraction prompt; previously
    # this string was duplicated verbatim in web_scrape_by_url and
    # batch_web_scrape, which invited silent drift between the two.
    DEFAULT_SCRAPE_QUERY = (
        "Extract all time related information from this page, include the "
        "name, time, location, and description of the event."
    )

    def __init__(self, cfg: DictConfig):
        """Initialize models and configuration.

        Args:
            cfg: Hydra config; must expose a ``urls`` attribute with the
                list of pages to scrape.
        """
        self.cfg = cfg
        # NOTE(review): job_queue is never consumed anywhere in this file —
        # confirm it is used elsewhere before removing.
        self.job_queue = Queue()
        self.llm = LLM.get_gpt_model()
        self.embedding = LLM.get_gpt_embedding_model()
        self.url = cfg.urls  # list of target URLs (cfg key is plural "urls")

    def content_search(self, query: str, is_detail: bool = False) -> dict:
        """Search the web for content matching ``query``.

        Args:
            query: Free-text search query.
            is_detail: When True, use the detail-content prompt and the
                ``DetailEventList`` schema instead of the summary ones.

        Returns:
            The structured output of ``SearchGraph.run()``.
        """
        if is_detail:
            prompt = get_detail_content_prompt(query)
        else:
            prompt = get_web_search_prompt(query)
        search_graph = SearchGraph(
            prompt=prompt,
            schema=DetailEventList if is_detail else EventList,
            config={
                "llm": {
                    "model_instance": self.llm,
                    "model_tokens": 4096,
                }
            },
        )
        return search_graph.run()

    def web_scrape_by_url(self, url: str, query: str = None) -> dict:
        """Scrape event information from a single URL.

        Args:
            url: Page to scrape.
            query: Extraction prompt; defaults to ``DEFAULT_SCRAPE_QUERY``.

        Returns:
            The structured output of ``SmartScraperGraph.run()``.
        """
        if query is None:
            query = self.DEFAULT_SCRAPE_QUERY
        web_graph = SmartScraperGraph(
            prompt=query,
            source=url,
            config={
                "llm": {
                    "model_instance": self.llm,
                    "model_tokens": 4096,
                },
                "embedding": {
                    "model_instance": self.embedding,
                    "model_tokens": 4096,
                },
            },
            schema=EventList,
        )
        return web_graph.run()

    def batch_web_scrape(self, urls: list, query: str = None) -> list[dict]:
        """Scrape event information from each URL in ``urls``.

        Args:
            urls: Pages to scrape, processed sequentially with a progress bar.
            query: Extraction prompt; defaults to ``DEFAULT_SCRAPE_QUERY``.

        Returns:
            One result dict per URL, in input order.
        """
        if query is None:
            query = self.DEFAULT_SCRAPE_QUERY
        results = []
        for url in tqdm(urls, desc="Scraping URLs"):
            results.append(self.web_scrape_by_url(url, query))
        return results
if __name__ == "__main__":
    from pathlib import Path

    # Resolve the repository-level "config" directory relative to this file
    # (three levels up), since hydra needs an absolute/relative path string.
    current_dir = Path(__file__).parent.parent.parent
    config_path = str(current_dir / "config")

    @hydra.main(config_path=config_path, config_name="crawl_config", version_base="1.1")
    def crawling_data(cfg: DictConfig):
        """Hydra entry point: scrape every configured URL and log each result."""
        crawl_service = CrawlService(cfg)
        logger.info("Starting web scraping with the following URLs: %s", crawl_service.url)
        results = crawl_service.batch_web_scrape(crawl_service.url)
        for result in results:
            # Lazy %-args (not an f-string) for consistency with the call
            # above and to defer formatting until the record is emitted.
            logger.info("Scraped data with result %s", result)

    crawling_data()