-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_service.py
More file actions
122 lines (87 loc) · 3.42 KB
/
crawl_service.py
File metadata and controls
122 lines (87 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from omegaconf import DictConfig
import hydra
import sys, os
from queue import Queue
from scrapegraphai.graphs import SmartScraperGraph, SearchGraph
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from config.envconfig import envconfig
from utils.llm import LLM
from prompt.web_scrape import get_web_search_prompt, get_detail_content_prompt
from utils.output_schema import EventList, DetailEventList
import logging
from tqdm import tqdm
# Module-level logging setup: basicConfig at import time so INFO-level
# messages (including tqdm-adjacent progress logs) are visible when this
# file is run as a script. NOTE(review): calling basicConfig on import
# affects any process that imports this module — confirm that is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class CrawlService:
    """Web crawling/scraping service built on scrapegraphai graphs.

    Wraps a ``SearchGraph`` (query-driven web search) and a
    ``SmartScraperGraph`` (single-URL scrape) around a shared LLM and
    embedding model obtained from the project's ``LLM`` factory.
    """

    # Single source of truth for the default extraction prompt; previously
    # this string was duplicated verbatim in web_scrape_by_url and
    # batch_web_scrape, which invited silent drift between the two.
    DEFAULT_SCRAPE_QUERY = (
        "Extract all time related information from this page, include the "
        "name, time, location, and description of the event."
    )

    def __init__(self, cfg: DictConfig):
        """Initialize models and configuration.

        Args:
            cfg: Hydra config; must expose a ``urls`` attribute with the
                list of pages to scrape.
        """
        self.cfg = cfg
        # NOTE(review): job_queue is never consumed anywhere in this file —
        # confirm it is used elsewhere before removing.
        self.job_queue = Queue()
        self.llm = LLM.get_gpt_model()
        self.embedding = LLM.get_gpt_embedding_model()
        self.url = cfg.urls  # list of target URLs (cfg key is plural "urls")

    def content_search(self, query: str, is_detail: bool = False) -> dict:
        """Search the web for content matching ``query``.

        Args:
            query: Free-text search query.
            is_detail: When True, use the detail-content prompt and the
                ``DetailEventList`` schema instead of the summary ones.

        Returns:
            The structured output of ``SearchGraph.run()``.
        """
        if is_detail:
            prompt = get_detail_content_prompt(query)
        else:
            prompt = get_web_search_prompt(query)
        search_graph = SearchGraph(
            prompt=prompt,
            schema=DetailEventList if is_detail else EventList,
            config={
                "llm": {
                    "model_instance": self.llm,
                    "model_tokens": 4096,
                }
            },
        )
        return search_graph.run()

    def web_scrape_by_url(self, url: str, query: str = None) -> dict:
        """Scrape event information from a single URL.

        Args:
            url: Page to scrape.
            query: Extraction prompt; defaults to ``DEFAULT_SCRAPE_QUERY``.

        Returns:
            The structured output of ``SmartScraperGraph.run()``.
        """
        if query is None:
            query = self.DEFAULT_SCRAPE_QUERY
        web_graph = SmartScraperGraph(
            prompt=query,
            source=url,
            config={
                "llm": {
                    "model_instance": self.llm,
                    "model_tokens": 4096,
                },
                "embedding": {
                    "model_instance": self.embedding,
                    "model_tokens": 4096,
                },
            },
            schema=EventList,
        )
        return web_graph.run()

    def batch_web_scrape(self, urls: list, query: str = None) -> list[dict]:
        """Scrape event information from each URL in ``urls``.

        Args:
            urls: Pages to scrape, processed sequentially with a progress bar.
            query: Extraction prompt; defaults to ``DEFAULT_SCRAPE_QUERY``.

        Returns:
            One result dict per URL, in input order.
        """
        if query is None:
            query = self.DEFAULT_SCRAPE_QUERY
        results = []
        for url in tqdm(urls, desc="Scraping URLs"):
            results.append(self.web_scrape_by_url(url, query))
        return results
if __name__ == "__main__":
    from pathlib import Path

    # Resolve the repository-level "config" directory relative to this file
    # (three levels up), since hydra needs an absolute/relative path string.
    current_dir = Path(__file__).parent.parent.parent
    config_path = str(current_dir / "config")

    @hydra.main(config_path=config_path, config_name="crawl_config", version_base="1.1")
    def crawling_data(cfg: DictConfig):
        """Hydra entry point: scrape every configured URL and log each result."""
        crawl_service = CrawlService(cfg)
        logger.info("Starting web scraping with the following URLs: %s", crawl_service.url)
        results = crawl_service.batch_web_scrape(crawl_service.url)
        for result in results:
            # Lazy %-args (not an f-string) for consistency with the call
            # above and to defer formatting until the record is emitted.
            logger.info("Scraped data with result %s", result)

    crawling_data()