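"""Scrape documents from Baidu Wenku (wenku.baidu.com).

'doc' and 'txt' documents are saved as a .txt file; other formats
(ppt, pdf, xls) are saved page by page as .jpg images in a folder
named after the document id.
"""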
import requests
import re
import json
import os

# One shared session so cookies and connections persist across requests.
session = requests.Session()


def fetch_url(url):
    # Wenku pages are served GBK-encoded, so decode accordingly.
    return session.get(url).content.decode('gbk')
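
# A more defensive fetch_url variant (a sketch; the timeout value and the
# error handling are my own choices, not part of the original script):
#
#   def fetch_url(url):
#       resp = session.get(url, timeout=10)
#       resp.raise_for_status()  # surface HTTP errors early
#       return resp.content.decode('gbk', errors='replace')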


def get_doc_id(url):
    # The document id is the token between 'view/' and '.html' in the URL.
    return re.findall(r'view/(.*)\.html', url)[0]


def parse_type(content):
    # Document type ('doc', 'txt', 'ppt', ...) from the page's embedded JS config.
    return re.findall(r"docType.*?:.*?'(.*?)',", content)[0]


def parse_title(content):
    # Document title, read from the same embedded config.
    return re.findall(r"title.*?:.*?'(.*?)',", content)[0]
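
# The two parsers above target entries in the page's inline JS config, which
# looks roughly like this (an illustrative excerpt, not verbatim page source):
#
#   'docType': 'doc',
#   'title': 'Some document title',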


def parse_doc(content):
    result = ''
    # Collect the per-page JSON content URLs embedded in the page. In the raw
    # HTML they appear JSON-escaped: '/' as '\\\/' and '"' as '\x22'.
    url_list = re.findall(r'(https.*?0\.json.*?)\\x22}', content)
    url_list = [addr.replace(r'\\\/', '/') for addr in url_list]
    for url in url_list[:-5]:  # skip the last five matches
        content = fetch_url(url)
        y = 0
        # Each fragment is (text, vertical position); a new 'y' starts a new line.
        txtlists = re.findall('"c":"(.*?)".*?"y":(.*?),', content)
        for item in txtlists:
            if y != item[1]:
                y = item[1]
                n = '\n'
            else:
                n = ''
            result += n
            # Fragments carry \uXXXX escapes; decode them into real characters.
            result += item[0].encode('utf-8').decode('unicode_escape', 'ignore')
    return result
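
# A minimal trace of the line-grouping rule above, on made-up fragments
# (values are illustrative, not real Wenku data). Fragments that share a
# 'y' value stay on one line; a change in 'y' emits a newline first:
#
#   [('Hello', '100'), ('World', '100'), ('Next line', '120')]
#   -> '\nHelloWorld\nNext line'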


def parse_txt(doc_id):
    content_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id
    content = fetch_url(content_url)
    # The md5sum value is concatenated into the URL without a separator below,
    # because the API returns it with its leading '&md5sum=' fragment attached.
    md5 = re.findall('"md5sum":"(.*?)"', content)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', content)[0]
    rsign = re.findall('"rsign":"(.*?)"', content)[0]
    content_url = ('https://wkretype.bdimg.com/retype/text/' + doc_id
                   + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign)
    content = json.loads(fetch_url(content_url))
    result = ''
    # The response is a list of pages, each holding its paragraphs in 'parags'.
    for item in content:
        for i in item['parags']:
            result += i['c'].replace('\\r', '\r').replace('\\n', '\n')
    return result
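
# Sketch of the getdocinfo fields parse_txt() relies on; the cb(...) wrapper
# follows from the 'callback=cb' query parameter, and the field values are
# illustrative placeholders, not real API output:
#
#   cb({..., "md5sum":"&md5sum=abc123", "totalPageNum":"12", "rsign":"rs789", ...})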


def parse_other(doc_id):
    # For other formats (ppt, pdf, xls), fetch the list of page-image URLs
    # and save each page as a JPEG in a folder named after the document id.
    content_url = ('https://wenku.baidu.com/browse/getbcsurl?doc_id=' + doc_id
                   + '&pn=1&rn=99999&type=ppt')
    content = fetch_url(content_url)
    url_list = re.findall('{"zoom":"(.*?)","page"', content)
    url_list = [item.replace('\\', '') for item in url_list]
    if not os.path.exists(doc_id):
        os.mkdir(doc_id)
    for index, url in enumerate(url_list):
        content = session.get(url).content
        path = os.path.join(doc_id, str(index) + '.jpg')
        with open(path, 'wb') as f:
            f.write(content)
    print('Images saved in the ' + doc_id + ' folder')
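
# Each entry in the getbcsurl response looks roughly like this (an illustrative
# shape implied by the regex above, not verbatim API output); the escaping
# backslashes are stripped by the replace() call:
#
#   {"zoom":"https:\/\/example.bdimg.com\/...\/0.jpg","page":1}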


def save_file(filename, content):
    with open(filename, 'w', encoding='utf8') as f:
        f.write(content)
    print('Saved as: ' + filename)


# test_txt_url = 'https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search'
# test_ppt_url = 'https://wenku.baidu.com/view/2b7046e3f78a6529657d5376.html?from=search'
# test_pdf_url = 'https://wenku.baidu.com/view/dd6e15c1227916888586d795.html?from=search'
# test_xls_url = 'https://wenku.baidu.com/view/eb4a5bb7312b3169a551a481.html?from=search'
def main():
    url = input('Enter the Wenku URL to download: ')
    content = fetch_url(url)
    doc_id = get_doc_id(url)
    doc_type = parse_type(content)
    title = parse_title(content)
    if doc_type == 'doc':
        result = parse_doc(content)
        save_file(title + '.txt', result)
    elif doc_type == 'txt':
        result = parse_txt(doc_id)
        save_file(title + '.txt', result)
    else:
        parse_other(doc_id)


if __name__ == "__main__":
    main()
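
# Example run (output is illustrative; the script file name is assumed, and
# Wenku's page format may have changed since this was written):
#
#   $ python wenku.py
#   Enter the Wenku URL to download: https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search
#   Saved as: <document title>.txt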