diff --git a/crawler/php/load.php b/crawler/php/load.php index 8e1f8f7..030bb84 100644 --- a/crawler/php/load.php +++ b/crawler/php/load.php @@ -22,5 +22,5 @@ if (preg_match('/(?<title>.*?)<\/title>/is', $content, $content_match)) $title = trim(html_entity_decode($content_match['title'])); # here we are doing a simple HTML page parsing to get <title> from that else continue; # we are not interested in pages without a title echo "{$match['path']}: $title {$match['url']} ".strlen($content)." bytes\n"; # let's say something about our progress - $manticore->query("REPLACE INTO rt (id,title,url,body) VALUES(".crc32($title).",'".$manticore->escape_string($title)."','".$manticore->escape_string($match['url'])."','".$manticore->escape_string($content)."')"); # and we are finally putting the contents to Manticore. We use crc32(title) as a document ID to avoid duplicates. + $manticore->query("REPLACE INTO rt (id,title,url,body) VALUES(".crc32($match['url']).",'".$manticore->escape_string($title)."','".$manticore->escape_string($match['url'])."','".$manticore->escape_string($content)."')"); # and finally we store the page in Manticore. crc32($match['url']) is the document ID, so REPLACE INTO overwrites the existing row when the same URL is loaded again instead of adding a duplicate. NOTE(review): crc32 is a 32-bit checksum, so two different URLs can end up with the same ID and overwrite each other - confirm this is acceptable for the crawl size. } # and we are going back to the next page wget reports as downloaded