首页 > python 2.7 unicode

python 2.7 unicode

我使用python爬虫抓取了一段网页
如果直接输出网页,中文会是乱码。
我使用如下方法解码

html.decode('gbk')

现在出现了这样的一个问题。

我抓取的网页类似于博客,我希望将博客的标题作为我保存文件的文件名

但有如下报错

开篇

Traceback (most recent call last):
  File "D:/Users/rongweiwei799/PycharmProjects/untitled1/Fregment Post.py", line 97, in <module>
    f = open('%s' %result[0],'w')
IOError: [Errno 22] invalid mode ('w') or filename: u'\r\n\u5f00\u7bc7\r\n'

“开篇”为标题名

代码如下

# coding:utf-8

import sys
import urllib2
import urllib
import cookielib
import re


# Python 2 idiom: sys.setdefaultencoding is removed from the sys module
# during interpreter startup, so the module is reloaded to get it back
# before the implicit str <-> unicode codec is switched to UTF-8.
# NOTE(review): this changes the default codec for the whole process --
# confirm nothing else in the program relies on the ASCII default.
reload(sys)
sys.setdefaultencoding('utf-8')


# Set the login user name / password here.
username = "rongweiwei799"
password = "XXX"



def make_cookie(name, value, domain, path='/'):
    """Create a ``cookielib.Cookie`` carrying *name*/*value* for *domain*.

    Only the name, value, domain and path vary; every other cookie
    attribute is fixed: version 0, no port, not secure, no expiry,
    ``discard`` off, no comment fields and ``rfc2109`` off.

    name   -- cookie name.
    value  -- cookie value.
    domain -- domain the cookie is bound to (marked as explicitly
              specified, without an initial dot).
    path   -- cookie path, '/' unless given (marked as explicitly specified).

    Returns the new ``cookielib.Cookie`` instance.
    """
    attributes = {
        # Caller-supplied identity of the cookie.
        'name': name,
        'value': value,
        'domain': domain,
        'path': path,
        # Fixed settings shared by every cookie built here.
        'version': 0,
        'port': None,
        'port_specified': False,
        'domain_specified': True,
        'domain_initial_dot': False,
        'path_specified': True,
        'secure': False,
        'expires': None,
        'discard': False,
        'comment': None,
        'comment_url': None,
        'rest': None,
        'rfc2109': False,
    }
    return cookielib.Cookie(**attributes)


# Set up the cookie container and the default request headers
# (Accept / User-Agent). The opener is installed globally, so every later
# urllib2.urlopen call in this script goes through it and shares __cookie.
__cookie = cookielib.CookieJar()
__req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie))
__req.addheaders = [
  ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
  ('User-Agent', 'Mozilla/5.0,(Windows NT 6.1; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0')
]
urllib2.install_opener(__req)



# Request the home page first to obtain the cookie (this creates the session).
# The response body is read and discarded.
request = urllib2.Request('http://profile.paic.com.cn/profile/')
urllib2.urlopen(request).read()



# Convert the user name to upper case.
username = username.upper()

# Build the form data to submit.
postdata = {'j_username': username, 'j_password': password,}



# Passing a data argument makes this a POST request to the login endpoint.
req = urllib2.Request('http://profile.paic.com.cn/profile/j_security_check', urllib.urlencode(postdata))


# Add a cookie (on the site this cookie is written by JavaScript in the page).
# Currently disabled.
#__cookie.set_cookie(make_cookie('E2E.loginUserName', username, 'emp.paic.com.cn'))

# Add the Referer header.
req.add_header('Referer', 'http://profile.paic.com.cn/profile/',)


# Send the login request. The response object is kept in `result` but its
# body is not read here.
result = urllib2.urlopen(req)

# Print the result (debugging aid, currently disabled).
#print result.read().decode('gbk').encode('utf-8')

#Logfile = []

# pat: lazily captures whatever follows the text `font-weight:bold">` up to
# the next `</div>`. DOTALL lets the capture span line breaks.
w1, w2 = r'font-weight:bold">', r'</div>'
pat = re.compile('%s(.*?)%s' % (w1, w2), re.DOTALL)

# bat: lazily captures whatever follows `class="divLogContent">` up to the
# next `</div>`, again across line breaks.
w3, w4 = r'class="divLogContent"[>]', r'</div>'
bat = re.compile('%s(.*?)%s' % (w3, w4), re.DOTALL)


def _title_to_filename(title, fallback):
    """Turn a scraped log title into a name that ``open`` accepts.

    The title pattern captures everything between the tags, including the
    CR/LF line breaks around the text, and Windows rejects file names that
    contain control characters or any of: backslash / : * ? " < > |

    title    -- raw captured title text.
    fallback -- name returned when nothing usable is left after cleaning.

    Returns the cleaned file name, or *fallback* if it would be empty.
    """
    # Drop surrounding whitespace first, then replace each run of characters
    # that are not allowed in a file name with a single underscore.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', title.strip())
    # Windows also refuses names that end in a space or a dot.
    cleaned = cleaned.rstrip(' .')
    return cleaned or fallback


# Fetch log pages with id 1..19; for every page that has a title, create a
# file named after the title and print the log content blocks.
for i in range(1, 20):
    Log = urllib2.urlopen('http://profile.paic.com.cn/profile/log/viewlog.shtml?id=%d' % i)
    try:
        # The pages are served as GBK; decode to unicode once, right here.
        buff = Log.read().decode('gbk')
    finally:
        Log.close()

    result = pat.findall(buff)
    if not result:
        # No title on this page: nothing to save.
        continue

    print(result[0])
    # Using result[0] directly as the file name fails with
    # "IOError: [Errno 22] invalid mode ('w') or filename" because of the
    # embedded '\r\n', so it is sanitised first.
    # NOTE(review): the file is only created here; the extracted content is
    # printed, not written -- confirm whether writing to f was intended.
    with open(_title_to_filename(result[0], 'log_%d' % i), 'w') as f:
        # Remove <p> and </p> tags before extracting the content blocks.
        Result = re.sub(r'[</]+?p>', '', buff)
        content = bat.findall(Result)
        for entry in content:
            print(entry)

Html源代码如下

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<title>PROFILE系统</title>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=GBK"/>
<script type="text/javascript" src="/profile/js/jquery-1.7.1.min.js"></script>
<link rel="stylesheet" href="/profile/style/style.css" type="text/css" media="screen, projection" />
<link rel="stylesheet" type="text/css" href="/profile/ke/plugins/code/prettify.css" />
<script type="text/javascript" charset="utf-8" src="/profile/ke/plugins/code/prettify.js"></script>
<script type="text/javascript">
$(function(){
    prettyPrint();
});
</script>
<link rel="stylesheet" href="/profile/style/jqModal.css" type="text/css" media="screen, projection"/>
<script type="text/javascript" src="/profile/js/jqModal.js"></script>
<script type="text/javascript" src="/profile/js/util.js"></script>
<script type="text/javascript" src="/profile/js/jquery.form.js"></script>
<script type="text/javascript" src="/profile/js/input_position.js"></script>
<script type="text/javascript" src="/profile/js/textarea_edit.js"></script>
<script type="text/javascript" src="/profile/js/jquery.lightbox-0.5.js"></script>
<link rel="stylesheet" type="text/css" href="/profile/style/jquery.lightbox-0.5.css" media="screen" />

<script type="text/javascript">
var weibo_maxlen = 200;
$(function(){
    
    $('#txtReplyContent').keydown(function(){
        var word_len = $(this).val().length;
        $('#spnWeiboLength').text(weibo_maxlen - word_len >= 0 ? weibo_maxlen - word_len : 0);
        if(word_len > weibo_maxlen){
            $(this).val($(this).val().substring(0, weibo_maxlen));
            return false;
        }
    });

    $('#favorite').click(function(){
        if(confirm('确定收藏吗?')){
            var targetId = $(this).attr('targetId');
            var targetType = $(this).attr('targetType');
            $.get('/profile/user/addFavorite.shtml',{targetId:targetId,targetType:targetType},function(data){
                if($.trim(data) == 'ok'){
                    showPopMsg('收藏成功!');
                    $('#favoriteCount').text(parseInt($('#favoriteCount').text(),10) + 1);
                }else{
                    showPopMsg($.trim(data));
                }
            });
        }
        
    });

    $('body').click(function(){
        $('#faceBox').hide();
    });
    
    loadReply();
    
    $('.divLogContent img').lightBox();
});

function loadReply(){
    $('#divReply').load('/profile/reply/showReply.shtml?rt=LOG&rtid=5', function(){
        $('#spnReplyCount').text($("#hidReplySize").val());
    });
}

var replySaveOption = {
        index:0,
        btn:null,
        txt:null,
        spn:null,
        url: '/profile/reply/saveReply.shtml',
        beforeSubmit:function(a,b,c){
            if($.trim(this.txt.val()) != ''){
                this.spn.text('').hide();
                $('#spnMsg0').fadeIn('slow');
                setTimeout("$('#spnMsg0').fadeOut('slow')",3000);
                this.btn.attr('disabled', 'true');
            }else{
                this.spn.text('说点什么吧!').show();
                return false;
            }
        },
        success: function(data, status) {
            if($.trim(data) == '1'){
                this.btn.removeAttr('disabled');
                this.txt.val('');
                loadReply();
                }
        } 
    };

function saveReply(index){
    if($('#hidReplyType' + index).val() != ''){
        replySaveOption.index = index;
        replySaveOption.btn = $('#btnSaveReply' + index);
        replySaveOption.txt = $('#txtReplyContent' + index);
        replySaveOption.spn = $('#spnWarn' + index);
        $('#formReply' + index).ajaxSubmit(replySaveOption);
    }
}
</script>
</head>
<body>

<script type="text/javascript">
    $(function(){
        checkNotify();
        
        getUserNeedDoTaskCount();
        setInterval('getUserNeedDoTaskCount()', 60000);
        window.setInterval('checkNotify();', 15 * 1000);
        $('#btnSearchAll').click(function(){
            var keyword = $.trim($('#txtKeyword').val());
            if(keyword != ''){
                $('#k').val(escape(keyword));
                $('#formSearch').attr('action','/profile/search/searchindex.shtml').submit();
            }else{
                alert('请先输入搜索的内容');
                return;
            }
        });
        $('#searchCategory').click(function(){
            var offset = $(this).offset();
            $('#searchCategorysBox').toggle();
            $('#searchCategorysBox').offset({top : offset.top + 24, left : offset.left});
        });
        $('#searchCategorysBox a').click(function(){
            var t = $(this).attr('t');
            $('#t').val(t);
            $('#searchCategory').text($(this).text());
            $('#searchCategorysBox').hide();
        });

        $('.taskCountTip').click(function(){
            var taskCount = $(this).text();
            if(taskCount > 0){
                var taskType = $(this).attr('target');
                var color = {'CASE':'#66CCFF','CHANGE':'#8F2C11','REQUEST':'green','STEP':'#FFAA00'};
                var taskListBox = $('#taskListBox');
                var coverBox = $('#coverBox');
                taskListBox.load('/profile/user/getUserNeedToDoList.shtml?type='+taskType,function(){
                    var offset = $('.headLinkBox').offset();
                    taskListBox.css({width:'700px',top : offset.top + 40,left : offset.left - 3,border : '1px solid #333'});
                    taskListBox.slideDown("slow");
                    //$('.tbTaskList tr:first-child').css({'background':'#EEE','border-top':'2px solid ' + eval('color.' + taskType)});
                    //$('.tbTaskList tr').css({'border-bottom':'1px solid #CCC'});
                    coverBox.show();
                    coverBox.height($('body').height());
                    $('#slideup').click(function(){
                        $('#taskListBox').slideUp("slow").delay(800);
                        coverBox.hide();
                    });
                });
            }
        });
        
    });
    
    function ajaxCallback(){
        if($.trim($('#divNotify').text()) != ''){
            $('#divNotify').slideDown();
        }else{
            $('#divNotify').slideUp();
        }
        
        if($("#hidHonorIDs").length > 0 && $("#hidHonorIDs").val() != ''){
            if($('#divHonor').is(':hidden')){
                showHonor(1, $("#hidHonorIDs").val());
            }
        }
    }
    
    function checkNotify(){
        asyLoadDataHtml('/profile/user/getUserNotify.shtml','','divNotify', ajaxCallback);
    }
    
    function showHonor(page, hids){
        $('#divHonor #ifrmHonor').attr('src','/profile/honor/getUserNotify.shtml?p='+ page +'&hids=' + hids);
        $("#divHonor").jqm({modal : true,width : 500}).jqmShow();
    }

    function closeHonorDialog(){
        $("#divHonor").jqm().jqmHide();
    }
    
    function getUserNeedDoTaskCount(uid){
        var url = "/profile/user/getUserNeedDoCount.shtml";
        if(uid){
            url += "?uid=" + uid;
        }
        asyLoadDataJson(url, '', function(data){
            for(var key in data){
                if($('#spnCount' + key).hasClass('divHeadTitleUp')){
                    if(data[key] != 0){
                        $('#spnCount' + key).text(data[key]).addClass('divHeadTitleInfo');
                    }else{
                        $('#spnCount' + key).text('').removeClass('divHeadTitleInfo');
                    }
                }else{
                    $('#spnCount' + key).show();
                    //$('#spnCount' + key).add($('#spnCount' + key).parent()).css({'background-color': '#B74939'});
                    $('#spnCount' + key).text(data[key]);
                    if(data[key] == 0){
                        $('#spnCount' + key).hide();
                        //$('#spnCount' + key).add($('#spnCount' + key).parent()).css({'background-color': ''});
                    }
                }
            }
        });
    }
    
</script>
<div id="divNotify" style="display: none;"></div>
<div class="jqmWindow" id="divHonor" style="padding:0px;background:transparent;border:0px">
<iframe style="height:320px;width:100%;background:transparent" scrolling="no" frameborder="0" id="ifrmHonor" allowtransparency="true"></iframe>
</div>

<div id="head">
    <div style="width: 1004px;margin-left: auto;margin-right: auto;height:40px;padding:0px">
        <div class="d_f" style="margin:0px;padding:0px;height:40px">
            <a href="/profile/welcome.shtml" style="padding:0px"><img src="/profile/images/logo.jpg" border="0" /></a>
        </div>
        <div style="padding:0px;margin:0px" class="d_f">
        <div>
        <div style="margin:0px" class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f"></div>
        <div class="divHeadTitleUp d_f" id="spnCountPROJECT"></div>
        <div class="clear"></div>
        </div>
        <div class="headLinkBox">
            <a href="/profile/home/myhome.shtml">首页</a>
            |
            <a href="/profile/log/mylog.shtml">日志</a>
            |
            <a href="/profile/weibo/getWeiboView.shtml?uid=RONGWEIWEI799">动态</a>
            |
            <a href="/profile/user/myfriend.shtml">关注</a>
            |
            <a href="/profile/reply/getReplyMeList.shtml">评论</a>
            |
            <a href="/profile/honor/getMyHonorView.shtml?cmd=in">评价</a>
            |
            <a href="/profile/vote/queryVoteList.shtml">投票</a>
            |
            <a href="/profile/project/myprojectlist.shtml?type=minedo">项目</a>
        </div>
        </div>
        <div class="headTaskTips">
            <a href="javascript:void(0);" class="caseTip"><span id="spnCountCASE" target="CASE" class="taskCountTip" title="待处理CASE数" style="display: none;"></span></a>
            <a href="javascript:void(0);" class="requestTip"><span id="spnCountREQUEST" target="REQUEST" class="taskCountTip" title="待处理REQUEST数" style="display: none;"></span></a>
            <a href="javascript:void(0);" class="changeTip"><span id="spnCountCHANGE" target="CHANGE" class="taskCountTip" title="待处理CHANGE数" style="display: none;"></span></a>
            <a href="javascript:void(0);" class="stepTip"><span id="spnCountSTEP" target="STEP" class="taskCountTip" title="待处理STEP数" style="display: none;"></span></a>
        </div>
        <div class="searchItem">
        <img src="/profile/images/search.png" style="cursor:hand;vertical-align:bottom" id="btnSearchAll"/>
        </div>
        <div class="searchItem">
        <input type="text" id="txtKeyword" name="txtKeyword" style="vertical-align:bottom;padding-bottom:0;padding-bottom:1px\9;width: 110px;"/>
        <form method="get" id="formSearch" target="_blank">
            <input type="hidden" id="k" name="k"/>
            <input type="hidden" id="t" name="t" value="LOG"/>
        </form>
        </div>
        <div class="searchItem">
            <a href="javascript:void(0);" id="searchCategory">日志</a>
        </div>
        <div id="searchCategorysBox">
            <a href="javascript:void(0);" t="LOG">日志</a>
            <a href="javascript:void(0);" t="user">用户</a>
            <a href="javascript:void(0);" t="WEIBO">动态</a>
            <a href="javascript:void(0);" t="HONOR">评价</a>
            <a href="javascript:void(0);" t="PROJECT">项目</a>
        </div>
        <div class="clear"></div>
    </div>
</div>
<div id="taskListBox" style="position:absolute;display: none;background: #FFF;overflow:hidden;z-index:99;">
</div>
<div id="coverBox" style="position:absolute;display:none;left:0;top:0;width:100%;height:100%;background:#000;opacity: 0.8;filter:alpha(opacity=80);z-index:90;"></div>
<div id="main">





<script type="text/javascript">
    $(function(){
        $('#btnCancelFriend').click(function(){
            var userID = $('#hidUserID').val();
            if(confirm('确认取消关注' + userID + "?")){
                $.post(
                    '/profile/user/cancelFriend.shtml?fid=' + userID,
                    function(data){
                        if($.trim(data) == '1'){
                            window.location.reload();
                        }else{
                            msgBox('取消失败,请联系管理员!');
                        }
                    }
                );
            }
        });
        
        $('#btnMakeFriend').click(function(){
            var userID = $('#hidUserID').val();
            $.post(
                '/profile/user/insertFriend.shtml?fid=' + userID,
                function(data){
                    if($.trim(data) == '1'){
                        window.location.reload();
                    }else{
                        msgBox('关注失败,请联系管理员!');
                    }
                }
            );
        });
        
        $('#btnGood').click(function(){
            var userID= $("#hidUserID").val();
            location.href="/profile/honor/addHonorView.shtml?uid="+userID+"&i=1";
        });
        
        $('#btnBad').click(function(){
            var userID= $("#hidUserID").val();
            location.href="/profile/honor/addHonorView.shtml?uid="+userID+"&i=-1";
        });
    });
</script>
<div id="left">
<input type="hidden" id="hidUserID" value="LINCHENG530">
<div style="background:#9B0200;"></div>
<div id="leftmenu" class="main_menu" style="padding:5px;">
    <div class="" style="width:100%;padding-left:13px;padding-bottom:5px">
        <div>
            <span class="face_bg_mid">
            <img class="faceMid" src="/profile/user/getUserFace.shtml?uid=LINCHENG530&type=big"/>
            </span>
        </div>
        <div>

<input type="button" id="btnMakeFriend" value="关注" class="btn3"/>


</div>
    </div>

</div>
</div>
 

<div id="middle">
<div id="title">
<img src="/profile/images/page_text.gif"/>&nbsp;林城的日志
</div>

<div class="d_l_1" style="width:650px;">
<div class="divLogTitle" style="padding:5px;line-height:20px;border-bottom:1px solid #ddd;border-top:1px solid #ccc;background:#eee;height:20px;font-weight:bold">
开篇
</div>
<div style="text-align:right;height:20px;line-height:20px;color:#666">
2012-09-12 14:57:15
分类:未分类
&nbsp;权限:完全公开
</div>
<div class="divLogContent"><p>
    开篇第一章:
</p>
<p>
    &nbsp;&nbsp;&nbsp; profile第一版上线测试!
</p>
<br />
</div>
<div style="color:#666">
    转发[0]&nbsp;&nbsp;
    评论[<span id="spnReplyCount"></span>]&nbsp;&nbsp;
    <a href="javascript:void(0);" id="favorite" targetId="5" targetType="LOG">收藏[<span id="favoriteCount">0</span>]</a>
</div>
<div style="border-bottom:1px solid #eee">
<div class="d_f">
<span class="face_bg_small">
<img src="/profile/user/getUserFace.shtml?uid=RONGWEIWEI799&type=small"/>
</span>
</div>
<div class="d_f">
<form method="post" id="formReply0">
<div>
<textarea name="txtReplyContent" id="txtReplyContent0"
        onkeydown="return editorKeyDown();"
        onkeyup="editorKeyUp(this);"
        onclick="editorClick(this);" style="width:500px;height:150"></textarea>
</div>
<div style="text-align:right;padding:2px">
<span class="d_f" style="margin:0px;color:#666">剩余字符<span id="spnWeiboLength">200</span></span>
<span class="d_f" style="margin:0;"><img title="表情" width="20" target="txtReplyContent0" class="imgFace" onclick="addFace($(this))" style="position:relative;top:-2px;" src="/profile/images/face/14.gif" /></span>
<span id="spnWarn0" class="spnWarn" style="display:none;"></span>
<span id="spnMsg0" class="spnMsg" style="display:none;">发布成功!</span>
<input type="checkbox" class="chkbox" name="chkWeibo" />同时转发
<input type="button" class="btn1" id="btnSaveReply0" value="评论" onclick="saveReply('0');"></input></div>
<input type="hidden" value="5" name="hidReplyToID" id="hidReplyToID"></input>
<input type="hidden" value="LINCHENG530" name="hidReplyToOwner"></input>
<input type="hidden" value="LOG" name="hidReplyType" id="hidReplyType0"></input>
<input type="hidden" value="0" name="hidReplyOrginalID"></input>
</form>
</div>
<div class="clear"></div>
</div>
<div id="divReply">

</div>
</div>

<!-- 分类 -->
<div class="d_r_1">
<div id="divLogType">
<div style="font-weight:bold">日志分类:</div>
<div>
<ul style="padding:0px">

    <li><a href="/profile/log/otherlog.shtml?uid=LINCHENG530&t=104">工作点滴</a></li>


<li><a href="/profile/log/otherlog.shtml?uid=LINCHENG530&t=0">未分类</a></li>
<li><a href="/profile/log/otherlog.shtml?uid=LINCHENG530">全部日志</a></li>

</ul>
</div>
</div>
</div>

</div>

<div class="clear"></div>
</div>
<div id="foot">
&copy;2012 基础架构应用平台领域 TK.Unit Some rights reserved.
</div>

</body>
</html>

文件名 u'\r\n\u5f00\u7bc7\r\n' 中的 \r\n 是回车换行,因此它是无效文件名,和编码无关。打开文件前先去掉首尾的空白字符即可,例如 open(result[0].strip(), 'w')。

【热门文章】
【热门文章】