报错的信息如下:
UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 21: illegal multibyte sequence
代码信息如下:
import urllib.request
import re
import csv
import datetime
import time
import requests
# Liepin job-listing scraper: downloads up to 100 result pages for one city
# and exports the extracted postings to a CSV file in the working directory.
starttime = datetime.datetime.now()
city = input('请输入城市的首字母,例如:北京-BJ(不区分大小写):')
print('下载中...')

# Compiled once, outside the page loop. Capture groups, in order:
# job link, job title, job description, publish time, company name.
pattern = re.compile('<li>.*?job-info.*?href="(.*?)".*?<span>(.*?)</span>.*?clearfix" title="(.*?)".*?<time>(.*?)</time>.*?company-name.*?"公司(.*?)"', re.S)

# One entry per downloaded page; each entry is the list of 5-tuples that
# the pattern matched on that page.
results = []
for number in range(100):
    url = 'http://' + city + '.liepin.com/zhaopin/pn' + str(number) + '/'
    try:
        response = requests.get(url, timeout=3)
    except requests.RequestException as exc:
        # A timeout or connection error on one page must not discard the
        # pages that were already downloaded, so skip it and carry on.
        print('第' + str(number + 1) + '页下载失败,已跳过:' + str(exc))
        continue
    # Only pages reported as UTF-8 are parsed; anything else is ignored.
    if response.encoding == 'UTF-8':
        html = response.text
        items = pattern.findall(html)
        results.append(items)
        print('当前下载第:' + str(number + 1) + '页!')

# Export to CSV.
# The encoding is given explicitly: without it, open() falls back to the
# platform default (gbk on Chinese Windows), which cannot represent
# characters such as '\xa0' and raises UnicodeEncodeError while writing.
# 'utf-8-sig' prepends a BOM so spreadsheet tools detect the file as UTF-8.
with open(city + time.strftime('%m%d') + '.csv', 'w', newline='',
          encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['公司名称', '职位名称', '职位链接', '职位说明', '发布时间'])
    for page in results:
        for link, title, description, published, company in page:
            # Reorder the regex groups to match the header row above.
            writer.writerow([company, title, link, description, published])
每一行的内容里面,如果有中文,需要进行utf-8编码
比如,你可以在 col_of_chn 里指定中文列的索引(index),
每次写入一行(write row)时,对该行的中文列进行编码:
def encode2printcsv(line, col_of_chn):
    """Encode the selected text cells of one CSV row to UTF-8, in place.

    Args:
        line: Mutable sequence holding the cells of one row.
        col_of_chn: Iterable of indexes into ``line`` naming the cells
            (e.g. the Chinese-text columns) that should be encoded.

    Returns:
        The same ``line`` object, with every selected ``str`` cell replaced
        by its UTF-8 encoded ``bytes``.

    Selected cells that are not ``str`` (numbers, ``None``, cells that were
    already encoded) are left untouched, so running the helper twice over
    the same row is harmless.

    NOTE: on Python 3 a ``csv.writer`` stringifies non-string cells with
    ``str()``, so ``bytes`` cells end up written as ``b'...'``. There it is
    usually better to keep the cells as ``str`` and pass ``encoding=`` to
    ``open()`` instead of calling this helper.
    """
    for i in col_of_chn:
        cell = line[i]
        # Only text can be encoded; anything else is passed through as is.
        if isinstance(cell, str):
            line[i] = cell.encode('utf-8')
    return line
# Encode the Chinese columns of every row, then write the rows out one by one.
# col_of_chn, list_of_list and csv_writer are not defined in this snippet;
# they are expected to be provided by the surrounding code.
list_of_encode_list = [encode2printcsv(x, col_of_chn) for x in list_of_list]
for x in list_of_encode_list:
    csv_writer.writerow(x)
第一行应该声明指定的编码:
# coding=UTF-8