我用 simple_html_dom 爬取网页,原网页的编码是 gb2312,所以用 mb_convert_encoding 把编码转换为 utf-8:
mb_convert_encoding($innertext, 'UTF-8', 'GB2312');
爬取地址
http://www.cba.gov.cn/cbastats/teamdetail.aspx?id=Te013
核心代码
// Download the team page and parse it with simple_html_dom.
$shd = new simple_html_dom();
$shd->load_file('http://www.cba.gov.cn/cbastats/teamdetail.aspx?id=Te013');

// Every <tr> of the roster table; the first row is discarded before iterating.
$playerNodes = $shd->find('table#DataGrid2 tr');
array_shift($playerNodes);

foreach ($playerNodes as $playerNode) {
    // Locate the cells of this row and the link inside the first cell.
    $cells = $playerNode->children();
    $links = $cells[0]->children();
    $link = $links[0];

    // The player id is whatever follows the first "=" in the link's href.
    $hrefParts = explode('=', $link->href);

    // Collect the fields. Text fields are converted to UTF-8 with 'GB2312' given as
    // the source encoding; characters outside that set (e.g. 喆) come out as "?".
    $player['team_id'] = $team['team_id'];
    $player['player_id'] = $hrefParts[1];

    $rawName = $link->innertext;
    $player['player_name'] = mb_convert_encoding($rawName, 'UTF-8', 'GB2312');

    $player['number'] = $cells[1]->innertext;

    $rawBirthday = $cells[2]->innertext;
    $player['birthday'] = mb_convert_encoding($rawBirthday, 'UTF-8', 'GB2312');

    $rawPosition = $cells[3]->innertext;
    $player['position'] = mb_convert_encoding($rawPosition, 'UTF-8', 'GB2312');

    $player['height'] = $cells[4]->innertext;
    $player['weight'] = $cells[5]->innertext;

    var_dump($player['player_name']);
}
结果
string(6) "李航"
string(9) "周启新"
string(6) "郭磊"
string(9) "赵泰隆"
string(4) "孙?"
string(9) "孙伟博"
string(9) "谢亚财"
string(9) "贾俊龙"
string(9) "陈林坚"
string(9) "王哲林"
string(9) "黄毅超"
string(9) "王增杰"
string(17) "法迪·哈提布"
string(16) "杰里米-泰勒"
string(20) "德怀特·拜克斯"
可以看到 string(4) "孙?" 没有转换过来,正确的应该是"孙喆"。
大部分转换正常,但是有些字转不过来,会转成问号——?,请教各位大神有什么好办法解决这个问题?
非常感谢~
试下这样看看?"喆"属于生僻字,不在 GB2312 中,但在 GBK 中存在;GB18030 字符集兼容 GBK。因此转换时源编码应指定为 GBK(或 GB18030),而不是 GB2312。
mark下生僻字集合:劼,晅,虞,崟,珺,祎,鏐,勍,璟,芃,夐,昱,昉,昳,旸,睿,崑,翀,弋,嬿,贇,喆
<?php
/**
 * Scrape the player roster of one CBA team page and dump each player's name.
 *
 * The page is parsed with simple_html_dom. Text fields read through innertext are
 * converted from the page's byte encoding to UTF-8 before being stored in $player.
 */
set_time_limit(0);
require_once('simple_html_dom.php');

// Decode as GBK rather than GB2312: rare characters such as 喆 are not part of
// GB2312, so naming GB2312 as the source encoding turns them into "?".
$sourceEncoding = 'GBK';

$shd = new simple_html_dom();
$shd->load_file('http://www.cba.gov.cn/cbastats/teamdetail.aspx?id=Te013');
$playerNodes = $shd->find('table#DataGrid2 tr');

// Drop the first row (presumably the table header -- confirm against the page).
unset($playerNodes[0]);

foreach ($playerNodes as $ky => $playerNode) {
    // Locate the cells of this row; skip rows that do not have all six columns.
    $playerNodeTds = $playerNode->children();
    if (count($playerNodeTds) < 6) {
        continue;
    }

    // The first cell is expected to contain the link to the player's page.
    $playerNodeA = $playerNodeTds[0]->children();
    if (empty($playerNodeA)) {
        continue;
    }

    // The player id is the part of the href after the first "=".
    $playerId = explode('=', $playerNodeA[0]->href);

    // Start from a clean record so values from a previous row never leak through.
    $player = array();

    // $team is not defined anywhere in this script; it is only used when the
    // including code provides it.
    $player['team_id'] = isset($team['team_id']) ? $team['team_id'] : null;
    $player['player_id'] = isset($playerId[1]) ? $playerId[1] : null;

    // Text fields: convert from the page encoding to UTF-8.
    $player['player_name'] = mb_convert_encoding($playerNodeA[0]->innertext, 'UTF-8', $sourceEncoding);
    $player['number'] = mb_convert_encoding($playerNodeTds[1]->innertext, 'UTF-8', $sourceEncoding);
    $player['birthday'] = mb_convert_encoding($playerNodeTds[2]->innertext, 'UTF-8', $sourceEncoding);
    $player['position'] = mb_convert_encoding($playerNodeTds[3]->innertext, 'UTF-8', $sourceEncoding);

    // Stored exactly as read from the page.
    $player['height'] = $playerNodeTds[4]->innertext;
    $player['weight'] = $playerNodeTds[5]->innertext;

    var_dump($player['player_name']);
}