diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e65de23
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+# maven
+chatbotv1/target/
+
+# py
+__pycache__/
+*.pyc
+
+# vim
+*.swp
+*.swo
+
+# idea
+*.iml
+*.ipr
+*.iws
+.idea/
+
+# tmp files
+subtitle/subtitle_crawler/result/
\ No newline at end of file
diff --git a/subtitle/README.md b/subtitle/README.md
new file mode 100644
index 0000000..2aeb52a
--- /dev/null
+++ b/subtitle/README.md
@@ -0,0 +1,16 @@
+# Crawl the subtitle corpus
+
+* Requires curl
+* Windows is not supported
+* Tested with py2.7
+
+## Install
+```
+pip install scrapy
+```
+
+## Download Data
+```
+cd ChatBotCourse/subtitle/subtitle_crawler
+scrapy crawl subtitle
+```
\ No newline at end of file
diff --git a/subtitle/subtitle_crawler/__init__.pyc b/subtitle/subtitle_crawler/__init__.pyc
deleted file mode 100644
index fbf82db..0000000
Binary files a/subtitle/subtitle_crawler/__init__.pyc and /dev/null differ
diff --git a/subtitle/subtitle_crawler/items.pyc b/subtitle/subtitle_crawler/items.pyc
deleted file mode 100644
index ca8dc59..0000000
Binary files a/subtitle/subtitle_crawler/items.pyc and /dev/null differ
diff --git a/subtitle/subtitle_crawler/pipelines.py b/subtitle/subtitle_crawler/pipelines.py
index 9577466..8faf29c 100644
--- a/subtitle/subtitle_crawler/pipelines.py
+++ b/subtitle/subtitle_crawler/pipelines.py
@@ -4,14 +4,17 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import os
+from subprocess import call
 
+DOWNLOAD_CMD = os.path.join(os.path.dirname(
+    os.path.realpath(__file__)), 'save.sh')
 
 class SubtitleCrawlerPipeline(object):
+    '''
+    Download the subtitle file with curl via save.sh.
+    '''
     def process_item(self, item, spider):
         url = item['url']
-        file_name = url.replace('/','_').replace(':','_')
-        fp = open('result/'+file_name, 'w')
-        fp.write(item['body'])
-        fp.close()
+        call([DOWNLOAD_CMD, url])
         return item
-
diff --git a/subtitle/subtitle_crawler/pipelines.pyc b/subtitle/subtitle_crawler/pipelines.pyc
deleted file mode 100644
index 097899c..0000000
Binary files a/subtitle/subtitle_crawler/pipelines.pyc and /dev/null differ
diff --git a/subtitle/subtitle_crawler/save.sh b/subtitle/subtitle_crawler/save.sh
new file mode 100755
index 0000000..3bda9d1
--- /dev/null
+++ b/subtitle/subtitle_crawler/save.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+###########################################
+# Save file with curl
+###########################################
+
+# constants
+baseDir=$(cd `dirname "$0"`;pwd)
+SAVE_PATH="$baseDir/result"
+# functions
+
+# main
+[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
+if [ ! -d "$SAVE_PATH" ]; then
+  # Create the result directory if $SAVE_PATH doesn't exist.
+  mkdir -p "$SAVE_PATH"
+fi
+
+cd "$SAVE_PATH"
+curl -JLO "$1"
diff --git a/subtitle/subtitle_crawler/settings.pyc b/subtitle/subtitle_crawler/settings.pyc
deleted file mode 100644
index 234e50e..0000000
Binary files a/subtitle/subtitle_crawler/settings.pyc and /dev/null differ
diff --git a/subtitle/subtitle_crawler/spiders/__init__.pyc b/subtitle/subtitle_crawler/spiders/__init__.pyc
deleted file mode 100644
index 4bb75bc..0000000
Binary files a/subtitle/subtitle_crawler/spiders/__init__.pyc and /dev/null differ
diff --git a/subtitle/subtitle_crawler/spiders/subtitle_spider.pyc b/subtitle/subtitle_crawler/spiders/subtitle_spider.pyc
deleted file mode 100644
index 006f965..0000000
Binary files a/subtitle/subtitle_crawler/spiders/subtitle_spider.pyc and /dev/null differ
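Note on the pipeline change: `SubtitleCrawlerPipeline.process_item` now reads only `item['url']` and delegates the download to `save.sh` (curl writes the file into `result/` with the server-supplied name), so the spider merely has to yield items carrying a `url` field instead of fetching and storing the subtitle body itself. Below is a minimal sketch of a spider shape that works with this pipeline; the start URL and the XPath are illustrative placeholders, not the contents of the project's actual `subtitle_spider`, and only the spider name `subtitle` is taken from the README.

```
# Minimal sketch of a spider compatible with the new pipeline.
# Assumptions: the start URL and XPath below are hypothetical; the real
# subtitle_spider in this repo may extract links differently.
import scrapy


class SubtitleSpider(scrapy.Spider):
    name = 'subtitle'
    start_urls = ['http://www.example.com/subtitles/']  # hypothetical listing page

    def parse(self, response):
        # The pipeline only needs a 'url' field; save.sh then fetches the
        # file with `curl -JLO`, keeping the server-provided filename.
        for href in response.xpath('//a/@href').extract():
            yield {'url': response.urljoin(href)}
```

For `process_item` to be invoked at all, `ITEM_PIPELINES` in `settings.py` must point at `subtitle_crawler.pipelines.SubtitleCrawlerPipeline`; `settings.py` is not part of this diff, so that wiring is assumed to be in place already.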