爬虫
文档与安装
requests
https://requests.readthedocs.io/en/latest/
BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
pip install requests
pip install beautifulsoup4
获取网页
import requests

# Download the novel's index page. The site serves UTF-8, so force the
# encoding before reading `.text` (requests may otherwise guess wrong).
url = 'https://www.xingyueboke.com/sanguoyanyi/'
# A timeout is required: requests has NO default timeout and would hang
# forever on a stalled connection.
response = requests.get(url, timeout=10)
response.encoding = 'utf-8'
html_content = response.text
提取信息
- css selector (beautifulsoup)
- XPath (lxml)
# Parse the index page and collect every chapter URL.
# Chapter links live in <div class="book-list"><ul><li><a href=...>.
soup = BeautifulSoup(html_content, 'html.parser')
indexArr = []
for book_list in soup.find_all("div", class_="book-list"):
    for book in book_list.ul.find_all("li"):
        # Hoist the href lookup so it is computed once per link.
        href = book.a.get('href')
        print(book.a.text)
        print(href)
        indexArr.append(href)
# Fetch each chapter page collected above and print its body text.
# The chapter body is the element with id "nr1".
for index in indexArr:
    # timeout prevents a single stalled chapter request from hanging the run.
    response = requests.get(index, timeout=10)
    response.encoding = 'utf-8'
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    # Renamed from `re` to avoid shadowing the stdlib `re` module.
    chapter_nodes = soup.select("#nr1")
    for node in chapter_nodes:
        print(node.text)
保存信息
def save_to_file(contents, filename='chapters2.txt'):
    """Append the text of each parsed chapter element to a UTF-8 file.

    Args:
        contents: iterable of elements, each exposing a ``.text`` attribute
            (e.g. BeautifulSoup tags from ``soup.select``).
        filename: path of the output file; opened in append mode so repeated
            calls accumulate chapters. Defaults to the original hard-coded
            'chapters2.txt' for backward compatibility.
    """
    with open(filename, 'a', encoding='utf-8') as f:
        for content in contents:
            # '\r\n\n' preserves the original output format: CRLF after the
            # chapter text plus a blank line separating chapters.
            f.write(content.text + '\r\n\n')