Collecting Celebrity Photos with Python (Complete Code and Images Included)

I have recently been working on a simple face-comparison project and needed some real-world face images for testing. Since celebrity photos are plentiful online, I wrote a Python script that automatically fetches celebrity face images and then uses the OpenCV library to validate and filter what it downloads, keeping, as far as possible, frontal shots in which the face region is reasonably large.
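Before the full script, here is a minimal, self-contained sketch of that face-based filtering step. It assumes OpenCV's bundled Haar cascade (exposed through cv2.data.haarcascades in the opencv-python package); the keep_image helper and its thresholds are illustrative choices of mine, not values taken from the script below:

import cv2

# Frontal-face detector shipped with opencv-python; cv2.data.haarcascades
# points at the directory containing the bundled cascade files.
detector = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def keep_image(path, min_face_ratio=0.2):
    """Return True if the file decodes and contains exactly one frontal
    face whose width covers at least min_face_ratio of the image width.
    (Hypothetical helper; the ratio threshold is an assumption.)"""
    img = cv2.imread(path)
    if img is None:                      # corrupt or non-image download
        return False
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = detector.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    if len(faces) != 1:                  # no face, or several people in frame
        return False
    x, y, w, h = faces[0]
    return w >= img.shape[1] * min_face_ratio

Because Haar cascades of this type only detect near-frontal faces reliably, a check like this naturally biases the collection toward frontal shots.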

The complete code is as follows:

import os
import re
import urllib.request
import urllib.parse

import requests
import cv2
from bs4 import BeautifulSoup
from xpinyin import Pinyin

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
url = "https://cn.bing.com/images/async?q={0}&first={1}&count={2}&scenario=ImageBasicHover&datsrc=N_I&layout=ColumnBased&mmasync=1&dgState=c*9_y*2226s2180s2072s2043s2292s2295s2079s2203s2094_i*71_w*198&IG=0D6AD6CBAF43430EA716510A4754C951&SFX={3}&iid=images.5599"

# Create the face detector
face_detecter = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')


def findImgUrlFromHtml(html, savepath, rule, count, countNum):
    '''Find the original-image urls in a thumbnail list page, download and
    filter the images, and return the updated image count.'''
    soup = BeautifulSoup(html, "lxml")
    link_list = soup.find_all("a", class_="iusc")
    for link in link_list:
        result = re.search(rule, str(link))
        if result is None:
            continue
        # Remove the stray "amp;" left over from HTML escaping
        url = result.group(0).replace("amp;", "")
        # Assemble the full url by stripping the leading '"murl":"'
        url = url[8:len(url)]
        try:
            # Download the image
            r = requests.get(url, timeout=10)
            imgpath = os.path.join(savepath, str(count + 1) + '.jpg')
            with open(imgpath, 'wb') as f:
                f.write(r.content)
            # Validate the file: discard anything OpenCV cannot decode
            img = cv2.imread(imgpath)
            if img is None:
                os.remove(imgpath)
                continue
            # Reject images that are too small (the original size check was
            # lost when the post was published; 128 px is an assumed minimum)
            h, w, _ = img.shape
            if w < 128 or h < 128:
                os.remove(imgpath)
                continue
            # Keep only images with exactly one, sufficiently large, frontal
            # face (reconstructed from the post's description of the filter)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            faces = face_detecter.detectMultiScale(gray, scaleFactor=1.1,
                                                   minNeighbors=5)
            if len(faces) != 1 or faces[0][2] < w * 0.2:
                os.remove(imgpath)
                continue
            count += 1
        except Exception:
            continue
        # Stop once enough images have been kept for this person
        if count >= countNum:
            return count
    return count


def getStartHtml(url, key, sfx):
    '''Fetch one thumbnail list page'''
    page = urllib.request.Request(url.format(key, 1, 10, sfx), headers=header)
    html = urllib.request.urlopen(page)
    return html


if __name__ == '__main__':
    # Initial parameters
    pinyin = Pinyin()        # Chinese-to-pinyin converter
    fileList = list()        # final file list
    countNum = 5             # number of images to collect per person
    data_root = 'starimgs'   # root directory for saved images

    # Read the list of celebrity names
    f = open("names.txt", encoding='utf-8')
    nameList = f.read().splitlines()
    f.close()

    # Crawl the images person by person
    id = 0
    for ch_name in nameList:
        print('Start crawling images of ' + ch_name)
        # Build an English folder name from the pinyin of the Chinese name
        en_name = pinyin.get_pinyin(ch_name)
        en_name = en_name.split('-')
        en_name = en_name[0].capitalize() + \
            ''.join(en_name[1:]).capitalize()
        # Crawl parameters
        key = urllib.parse.quote(ch_name)
        sfx = 1
        count = 0
        # Create the save path; skip people that were already crawled
        savepath = os.path.join(data_root, en_name)
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        else:
            continue
        # Regex that extracts the original-image url from the thumbnail page
        rule = re.compile(r"\"murl\"\:\"http\S[^\"]+")
        # Request successive result pages until enough images are saved
        # (the tail of this loop was truncated in the original post and is
        # reconstructed here)
        while count < countNum:
            html = getStartHtml(url, key, sfx)
            count = findImgUrlFromHtml(html, savepath, rule, count, countNum)
            sfx += 1
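To run the script, put a names.txt file next to it with one celebrity name per line (for example, 易烊千玺); the images for each person are then saved under starimgs/<PinyinName>/. Note that because of the else: continue branch, any person whose folder already exists is skipped on later runs. Judging from the imports, the third-party dependencies are requests, beautifulsoup4 (with the lxml parser), xpinyin and opencv-python, and the haarcascade_frontalface_default.xml file must be present in the working directory.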
