requests库、re、html、xlwt
from bs4 import BeautifulSoup  # parse web pages
import re  # regular expressions, for text matching
import urllib.request  # build the request and fetch the page data
import urllib.error  # errors raised while fetching
import xlwt  # Excel operations
import sqlite3  # SQLite database operations


# 2. Request access to the web page
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to crawl.

    Returns:
        The decoded page text, or an empty string when the request
        fails with ``urllib.error.URLError`` (the error's ``code`` and/or
        ``reason`` are printed in that case).
    """
    # Pose as a regular browser when requesting the page.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; every URLError carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# url: the address of the page you want to crawl
User-Agent获取方法:
3、获取数据(正则表达式) #电影