首页 > 如何获取这个网站(http://www.itslaw.com/)搜索目标的标题?它是由 JS 加载的,试了很多方法都不行……

如何获取这个网站(http://www.itslaw.com/)搜索目标的标题?它是由 JS 加载的,试了很多方法都不行……

# -*- coding: utf-8 -*-
import requests
from pyquery import PyQuery as pq
from goose import Goose
from goose.text import StopWordsChinese
import json
import time


class ItSlaw(object):
    """Keyword search against the itslaw.com ``caseFiles`` JSON endpoint.

    Typical use: call ``reset(keyword)``, then ``fetch()``.  The list built by
    the most recent ``fetch()`` is also available through ``get_result()``.
    """

    # Proxy mapping handed to requests for every search call.
    # NOTE(review): the address carries no scheme or port -- confirm it is
    # still a live proxy, or set ``self.proxies = None`` to connect directly.
    DEFAULT_PROXIES = {"http": "14.111.148.1"}

    # Seconds to wait before each search request.
    REQUEST_DELAY = 3

    # Seconds before an unanswered search request is abandoned, so a dead
    # proxy or server cannot block the caller forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # URL template; both placeholders receive the same search keyword.
        self.url = 'http://www.itslaw.com/api/v1/caseFiles?startIndex=0&countPerPage=20&sortType=1&conditions=searchWord+{keyword}+1+{keywordcopy}'
        # Request headers sent with every search call.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
                        "Accept": "application/json, text/plain, */*",
                        "Accept-Encoding": "gzip, deflate, sdch",
                        "Accept-Language": "zh-CN,zh;q=0.8",
                        "Cache-Control": "no-cache",
                        "Connection": "keep-alive",
                        "Host": "www.itslaw.com",
                        "If-Modified-Since": "Mon, 26 Jul 1997 05:00:00 GMT",
                        "Pragma": "no-cache",
                        "Referer": "http://www.itslaw.com"}
        self.result = None
        self.keyword = None
        self.session = requests.Session()
        # Per-instance copy so callers can change or disable the proxy.
        self.proxies = dict(self.DEFAULT_PROXIES)

    def reset(self, keyword):
        """Set the keyword for the next ``fetch()`` and discard old results."""
        self.keyword = keyword
        self.result = None

    def _build_url(self):
        """Return the search URL with the current keyword percent-encoded.

        Raises:
            ValueError: if no keyword has been set via ``reset()``.
        """
        # Local import keeps the block self-contained on Python 2 and 3.
        try:
            from urllib import quote  # Python 2
        except ImportError:
            from urllib.parse import quote  # Python 3

        if self.keyword is None:
            raise ValueError('no keyword set; call reset(keyword) first')

        kw = self.keyword
        # quote() on Python 2 cannot handle non-ASCII unicode objects, so
        # always hand it UTF-8 bytes.
        if not isinstance(kw, bytes):
            kw = kw.encode('utf-8')
        # safe='' so that '+', '/' or '&' inside the keyword cannot be
        # mistaken for the '+' separators of the ``conditions`` parameter.
        quoted = quote(kw, safe='')
        return self.url.format(keyword=quoted, keywordcopy=quoted)

    @staticmethod
    def _iter_titled(node):
        """Yield every dict inside decoded JSON ``node`` that has a 'title' key.

        A dict that carries a 'title' is treated as one record and is not
        searched any deeper.
        """
        if isinstance(node, dict):
            if 'title' in node:
                yield node
                return
            children = node.values()
        elif isinstance(node, list):
            children = node
        else:
            return
        for child in children:
            for found in ItSlaw._iter_titled(child):
                yield found

    def fetch(self):
        """Run the search for the current keyword and store the result list.

        Returns:
            list of dict: one ``{'title', 'url', 'content'}`` entry per titled
            record found in the JSON response.  ``'url'`` and ``'content'``
            are ``None`` when the record carries no ``'url'`` field.

        Raises:
            ValueError: if no keyword is set, or the body is not valid JSON.
            requests.exceptions.RequestException: on network or HTTP errors.
        """
        url = self._build_url()

        # requests cannot decode 'sdch', so do not advertise it on the wire.
        request_headers = dict(self.headers)
        request_headers["Accept-Encoding"] = "gzip, deflate"

        time.sleep(self.REQUEST_DELAY)
        r = self.session.get(url, headers=request_headers,
                             proxies=self.proxies,
                             timeout=self.REQUEST_TIMEOUT)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(r.status_code)
        print('@@' * 20)
        r.raise_for_status()
        payload = r.json()

        g = Goose({'stopwords_class': StopWordsChinese})
        res = []
        # NOTE(review): the response schema is not documented in this file.
        # Every JSON object with a 'title' key is assumed to be one search
        # hit, and an optional 'url' key is assumed to be its link -- confirm
        # against a real response and narrow this to the actual result path.
        for item in self._iter_titled(payload):
            link = item.get('url')
            content = None
            if link:
                if link.startswith(('http://', 'https://')):
                    completed_url = link
                else:
                    completed_url = 'http://www.itslaw.com/' + link.lstrip('/')
                content = g.extract(url=completed_url).cleaned_text
            res.append({'title': item['title'], 'url': link, 'content': content})
        self.result = res
        return self.result

    def get_result(self):
        """Return the list stored by the last ``fetch()``, or ``None``."""
        return self.result


def main():
    """Run one sample search and print the result stored on the client."""
    client = ItSlaw()
    client.reset('九州通医药集团股份有限公司')
    client.fetch()
    # Single-argument print(...) prints the same text as the print statement.
    print(client.get_result())


if __name__ == '__main__':
    main()

对于用 JS 加载的内容,建议先用 Chrome 开发者工具查看数据是由哪个异步请求获取到的,然后模拟该请求(就像页面上的 JS 那样发出),再解析返回的结果。

【热门文章】
【热门文章】