import requests
import logging
import parsel
import re
import os
# Request headers sent with every HTTP call; the site is fetched with a
# desktop-browser user agent.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0',
}

# Log at INFO level with a timestamp and the level name in front of each message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Number of listing pages to walk through (pages 1..TOTAL_PAGE inclusive).
TOTAL_PAGE = 115
def changer_title(name):
    """Return *name* with characters unsuitable for a file name replaced by ``_``.

    The characters replaced are ``/ \\ : * ? " < > |`` and the backspace
    control character (``\\b`` inside a regex character class means
    backspace, not a word boundary).

    Args:
        name: The raw title text.

    Returns:
        A copy of *name* in which every listed character is an underscore.
    """
    forbidden = re.compile(r'[/\\:*?"<>|\b]')
    return forbidden.sub('_', name)
def scrape_page(url, timeout=30):
    """Fetch *url* and return its body decoded as GB2312 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            server would block the crawl indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or a ``requests.RequestException``). Both
        failure cases are logged.
    """
    logging.info('正在获取源代码数据 %s', url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        logging.error('抓取时发生错误 %s', url, exc_info=True)
        return None

    if response.status_code != 200:
        # Two placeholders for two arguments; with a single %s the logging
        # module reports a formatting error instead of emitting the message.
        logging.error('抓取时获取无效状态码 %s: %s', response.status_code, url)
        return None

    # Set the encoding explicitly before reading .text so the body is decoded
    # as GB2312 rather than with whatever requests would guess.
    response.encoding = 'gb2312'
    return response.text
def scrape_index(page):
    """Fetch one listing page and return its HTML.

    Args:
        page: Page number substituted into the listing URL.

    Returns:
        Whatever ``scrape_page`` returns for the listing URL: the page text
        on success, or ``None`` when the request fails.
    """
    # No whitespace around the placeholder: stray spaces would end up inside
    # the URL and point at a page that does not exist.
    index_url = f'https://www.shijuan1.com/a/sjsx7/list_124_{page}.html'
    return scrape_page(index_url)
def parse_index(html):
    """Download every archive linked from one listing page.

    For each entry found in *html* the function fetches the entry's detail
    page, extracts the "本地下载" link from it, downloads the target and
    writes it as ``<sanitised title>.rar`` inside the directory named by the
    module-level ``html_file`` variable.

    An entry is logged and skipped, rather than aborting the remaining
    entries, when its detail page cannot be fetched, the page contains no
    download link, or the download itself fails.

    Args:
        html: Markup of a listing page.
    """
    base_url = 'https://www.shijuan1.com'
    selector = parsel.Selector(text=html)
    titles = selector.css('tr td:first-child a.title::text').getall()
    links = selector.css('tr td:first-child a.title::attr(href)').getall()

    for title, link in zip(titles, links):
        title = changer_title(title)

        # Reuse scrape_page so the detail page gets the same encoding,
        # status check and error logging as the listing pages.
        detail_html = scrape_page(base_url + link)
        if not detail_html:
            continue

        # re.search instead of findall(...)[0]: a page without a download
        # link must not raise IndexError and stop the whole crawl.
        match = re.search(
            '<li><a href="(.*?)" target="_blank">本地下载</a></li>',
            detail_html,
        )
        if match is None:
            logging.error('未找到下载链接 %s', base_url + link)
            continue
        download_url = base_url + match.group(1)

        try:
            response = requests.get(url=download_url, headers=headers, timeout=30)
        except requests.RequestException:
            logging.error('抓取时发生错误 %s', download_url, exc_info=True)
            continue
        if response.status_code != 200:
            # Do not save an error page under a .rar name.
            logging.error(
                '抓取时获取无效状态码 %s: %s', response.status_code, download_url
            )
            continue

        with open(os.path.join(html_file, title + '.rar'), mode='wb') as f:
            f.write(response.content)
if __name__ == '__main__':
    # Output directory; parse_index() reads this module-level name when it
    # builds the path of each saved archive.
    html_file = '爬取内容/'
    # exist_ok=True creates the directory only when needed, without the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(html_file, exist_ok=True)

    # Walk every listing page; pages that could not be fetched are skipped.
    for page in range(1, TOTAL_PAGE + 1):
        index_html = scrape_index(page)
        if index_html:
            parse_index(index_html)