首页 > simple_html_dom乱码问题

simple_html_dom乱码问题

我用 simple_html_dom 爬取网页,原网页的编码是 GB2312,所以用 mb_convert_encoding 把内容转换为 UTF-8:

// Convert the scraped text from GB2312 to UTF-8.
mb_convert_encoding($innertext, 'UTF-8', 'GB2312');

爬取地址

http://www.cba.gov.cn/cbastats/teamdetail.aspx?id=Te013

核心代码

// Load the team page and walk the rows of the roster table (#DataGrid2),
// collecting one $player record per row and dumping the player's name.
$shd = new simple_html_dom();
$shd->load_file('http://www.cba.gov.cn/cbastats/teamdetail.aspx?id=Te013');
$playerNodes = $shd->find('table#DataGrid2 tr');
// Drop the first row (presumably the table header -- confirm against the page).
unset($playerNodes[0]);
foreach ($playerNodes as $ky => $playerNode) {
    // Locate the nodes: the row's cells, then the children of the first cell.
    $playerNodeTds = $playerNode->children();
    $playerNodeA = $playerNodeTds[0]->children();
    // Split the first child's href on '='; element [1] is used as the player id.
    $playerId = explode('=', $playerNodeA[0]->href);
    
    // Extract the content.
    // NOTE(review): $team is not defined in this snippet; presumably set by surrounding code.
    $player['team_id'] = $team['team_id'];
    $player['player_id'] = $playerId[1];
    // Text fields are decoded as GB2312; characters outside GB2312 come out as "?".
    $player['player_name'] = mb_convert_encoding($playerNodeA[0]->innertext, 'UTF-8', 'GB2312');
    $player['number'] = $playerNodeTds[1]->innertext;
    $player['birthday'] = mb_convert_encoding($playerNodeTds[2]->innertext, 'UTF-8', 'GB2312');
    $player['position']  = mb_convert_encoding($playerNodeTds[3]->innertext, 'UTF-8', 'GB2312');
    $player['height'] = $playerNodeTds[4]->innertext;
    $player['weight'] = $playerNodeTds[5]->innertext;

    var_dump($player['player_name']);
}

结果


string(6) "李航"
string(9) "周启新"
string(6) "郭磊"
string(9) "赵泰隆"
string(4) "孙?"
string(9) "孙伟博"
string(9) "谢亚财"
string(9) "贾俊龙"
string(9) "陈林坚"
string(9) "王哲林"
string(9) "黄毅超"
string(9) "王增杰"
string(17) "法迪·哈提布"
string(16) "杰里米-泰勒"
string(20) "德怀特·拜克斯"

可以看到 string(4) "孙?"没有转换过来,正确的是孙喆

大部分转换正常,但是有些字转不过来,会转成问号——?,请教各位大神有什么好办法解决这个问题?

非常感谢~


试下这样看看?“喆”属于生僻字,不在 GB2312 中,但在 GBK 中存在;GB18030 字符集又兼容 GBK。所以转换时应把源编码指定为 GBK(或 GB18030),而不是 GB2312。

mark下生僻字集合:劼,晅,虞,崟,珺,祎,鏐,勍,璟,芃,夐,昱,昉,昳,旸,睿,崑,翀,弋,嬿,贇,喆

<?php
// Scrape the CBA team roster page and dump each player's name as UTF-8.
//
// The page declares GB2312, but some names use characters (e.g. the last
// character of "孙喆") that only exist in the GBK superset. Decoding with the
// strict GB2312 table replaces those characters with "?", so every text field
// is decoded as GBK instead and converted straight to UTF-8.
//
// Alternative approach: fetch the raw page with file_get_contents(), convert
// the whole document once with iconv('GBK', 'UTF-8', $page), and hand the
// result to $shd->load($page) instead of calling load_file().
set_time_limit(0);

require_once('simple_html_dom.php');

// Encoding the remote page is actually written in (superset of GB2312).
$sourceEncoding = 'GBK';
$url = 'http://www.cba.gov.cn/cbastats/teamdetail.aspx?id=Te013';

$shd = new simple_html_dom();
// Some simple_html_dom versions return false from load_file() on failure;
// others return nothing, in which case this check is a no-op.
if ($shd->load_file($url) === false) {
    exit('Failed to load ' . $url);
}

$playerNodes = $shd->find('table#DataGrid2 tr');
// Drop the first row (presumably the table header -- confirm against the page).
unset($playerNodes[0]);

foreach ($playerNodes as $ky => $playerNode) {
    // Locate the nodes: the row's cells, then the children of the first cell.
    $playerNodeTds = $playerNode->children();
    if (count($playerNodeTds) < 6) {
        // Not a full player row (six cells are read below); skip it.
        continue;
    }
    $playerNodeA = $playerNodeTds[0]->children();
    if (!isset($playerNodeA[0])) {
        // No child element in the first cell, so there is no link to read.
        continue;
    }
    // Split the link's href on '='; element [1] is used as the player id.
    $playerId = explode('=', $playerNodeA[0]->href);

    // Extract the content.
    $player = array();
    // NOTE(review): $team is not defined in this snippet; presumably set by surrounding code.
    $player['team_id'] = isset($team['team_id']) ? $team['team_id'] : null;
    $player['player_id'] = isset($playerId[1]) ? $playerId[1] : null;
    $player['player_name'] = mb_convert_encoding($playerNodeA[0]->innertext, 'UTF-8', $sourceEncoding);
    $player['number'] = mb_convert_encoding($playerNodeTds[1]->innertext, 'UTF-8', $sourceEncoding);
    $player['birthday'] = mb_convert_encoding($playerNodeTds[2]->innertext, 'UTF-8', $sourceEncoding);
    $player['position'] = mb_convert_encoding($playerNodeTds[3]->innertext, 'UTF-8', $sourceEncoding);
    $player['height'] = $playerNodeTds[4]->innertext;
    $player['weight'] = $playerNodeTds[5]->innertext;

    var_dump($player['player_name']);
}
【热门文章】
【热门文章】