首页 > 小白求助,python3.5编码问题

小白求助,python3.5编码问题

新手学习python,网上copy了一段爬虫,求助攻!
环境:win8、sublime text3
原文:http://blog.csdn.net/omuyejingfeng1/article/details/24182313

# -*- coding: utf-8 -*-
import locale
import urllib.request as request  
import urllib.parse as parse  
import string  
import re  
import os  
import urllib.error as error  
print(u""" 
+++++++++++++++++++++++ 
  学校:超神学院 
  专业:德玛班 
  姓名:德玛之力 
  version: python3.2 
+++++++++++++++++=++++ 
     """)  
def baidu_tieba(url, begin_page, end_page):  
    count = 1  
    for i in range(begin_page, end_page + 1):  
        sName = 'f:/test/'+str(i).zfill(5)+'.html'  
        print (locale.getdefaultlocale())
        print (u'正在下载第'+str(i)+u'个页面, 并保存为'+sName)
        m = request.urlopen(url+str(i)).read()  
        #创建目录保存每个网页上的图片  
        dirpath = 'f:/test/'  
        dirname = str(i)  
        new_path = os.path.join(dirpath, dirname)  
        if not os.path.isdir(new_path):  
            os.makedirs(new_path)  
        page_data = m.decode('GBK')     
        page_image = re.compile('<img src=\"(.+?)\"')  
        for image in page_image.findall(page_data):  
            pattern = re.compile(r'^http://.*.png$')  
            if  pattern.match(image):  
                try:  
                    image_data = request.urlopen(image).read()  
                    image_path = dirpath + dirname +'/'+str(count)+'.png'  
                    count += 1  
                    print(image_path)  
                    with open(image_path, 'wb') as image_file:  
                        image_file.write(image_data)  
                    image_file.close()  
                except error.URLError as e:  
                    print('Download failed')  
        with open(sName,'wb') as file:  
            file.write(m)  
        file.close()  
if __name__ == "__main__":  
    url = "http://tieba.baidu.com/p/"  
    begin_page = 1  
    end_page = 3  
    baidu_tieba(url, begin_page, end_page)  

输出乱码:

+++++++++++++++++++++++ 
  ѧУ■■■■■ѧԺ 
  רҵ■■■■■ 
  ■■■■■■■■■■ 
  version: python3.2 
+++++++++++++++++=++++ 
     
('zh_CN', 'cp936')
■■■■■■■■ҳ■, ■■■■■■■f:/test/00001.html
Traceback (most recent call last):
  File "E:\python\test_3.py", line 52, in <module>
    baidu_tieba(url, begin_page, end_page)  
  File "E:\python\test_3.py", line 30, in baidu_tieba
    page_data = m.decode('GBK')     
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 3697: illegal multibyte sequence
>>> 

UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 3697: illegal multibyte sequence

编码问题,看看原网页是什么编码


我运行你的代码是没有问题的,也没有乱码,综合分析,是你的sublime text3 中文乱码,因为你的sublime text3 它默认不支持GBK的编码格式,跟原网页,系统编码是没有关系的,可以使用IDLE验证,解决办法,就是安装插件。

【热门文章】
【热门文章】