首页 > 如何用程序获取一件商品的具体信息

如何用程序获取一件商品的具体信息

如何根据输入的商品名(如书名)获取具体的出品公司、出版社?可以从京东、当当上拉取吗?豆瓣那么多数据又是从哪里、怎么拉取的?


到国家图书馆(www.nlc.gov.cn)或者类似的网站查询 ISBN,然后用程序实现这个查询动作就好了。


因为我一直都比较喜欢看书,所以我希望自己能够开发一个小工具,自动地帮我收集书籍的信息。但是一直没能找到很好的数据源。

根据 @Kslr 的提示,从国家图书馆抓取我要的信息,我用Jsoup写了一份代码,能够抓取到图书的基本信息了。

豆瓣开放了自己的 API,我们可以直接使用他们 SDK 中的图书 API 来查询相关信息。

当当也有一个 API,京东现在也在做云计算,所以它们也都有自己的 SDK,需要我们学习一下。

下面是例子,你可以参考参考。如果我们有共同目标,你还可以联系我,我们可以讨论讨论。

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Fetches a search-result page from {@code find.nlc.gov.cn} with jsoup and
 * copies the scraped fields of every {@code div.item} into an {@link NLCResult}.
 *
 * <p>Scraping is best-effort: a field whose markup is missing or malformed is
 * left empty/unset instead of aborting the whole page.
 *
 * Created by pingjiang on 14-6-22.
 */
public class NLCSearcher {
    private static final String URL_BASE = "http://find.nlc.gov.cn";
    // Named URL_SEARCH (not URL) so it cannot be confused with the java.net.URL type.
    private static final String URL_SEARCH = URL_BASE + "/search/doSearch";
    private static final String URL_AJAX = URL_BASE + "/search/ajaxSearch";

    // The two arrays below have the same length and appear to be index-aligned:
    // a display label and the matching request field name (presumably as the
    // remote site spells it, including "identifer" -- verify before "fixing").
    static String[] TARGET_FIELDS_LOGS = new String[] { "全部字段", "题名", "责任者", "关键词", "出版商", "ISBN" };
    static String[] TARGET_FIELDS = new String[] { "", "alltitle", "allcreator", "keywords", "publisher", "identifer" };

    /**
     * Runs the search described by {@code nlcResult.getSearchCriteria()} and
     * stores the outcome in the same object.
     *
     * @param nlcResult carries the criteria and receives the parsed items
     * @throws NLCException if the page cannot be fetched or lacks the result container
     */
    public void search(NLCResult nlcResult) throws NLCException {
        search(nlcResult.getSearchCriteria(), nlcResult);
    }

    /**
     * Downloads the page at {@code searchCriteria.buildURL()} (10 second
     * timeout) and parses it into {@code nlcResult}.
     *
     * @param searchCriteria supplies the URL to fetch
     * @param nlcResult      receives the parsed items, total count and page number
     * @throws NLCException if fetching fails with an {@link IOException} or the
     *                      result container element is not found
     */
    public void search(SearchCriteria searchCriteria, NLCResult nlcResult) throws NLCException {
        try {
            parse(Jsoup.parse(new URL(searchCriteria.buildURL()), 10000), nlcResult);
        } catch (IOException e) {
            throw new NLCException(e);
        }
    }

    /**
     * Extracts all result items plus the paging information from a fetched page.
     *
     * @param doc       parsed search-result page
     * @param nlcResult receives one item per {@code div.item}, the total count
     *                  and, when available, the current page number
     * @throws NLCException if {@code div#searchresult-items} is missing
     */
    private void parse(Document doc, NLCResult nlcResult) throws NLCException {
        Element container = doc.select("div#searchresult-items").first();
        if (container == null) {
            throw new NLCException("div#searchresult-items element is not found");
        }

        for (Element item : container.select("div.item")) {
            nlcResult.addResultItem(parseItem(item));
        }

        // A missing or non-numeric counter is treated as "no results" (0),
        // the same value the absent-element case has always produced.
        int totalCount = parseIntOrZero(firstText(doc.select("b#totalCnt")));
        nlcResult.setTotalCount(totalCount);

        // Look for the pager in the whole document. The previous code searched
        // only inside the individual div.item elements, where a pager is not
        // expected to live, so the current page number was never picked up.
        Element page = doc.select("div.page").first();
        if (totalCount > 0 && page != null) {
            nlcResult.setCurrentPageNo(parseIntOrZero(firstText(page.select("span.current"))));
        }
    }

    /**
     * Builds one result item from a single {@code div.item} element.
     *
     * @param item the {@code div.item} element of one search hit
     * @return the populated item; fields whose markup is absent stay empty/unset
     * @throws NLCException if resolving an {@code onclick} handler to a URL fails
     */
    private NLCResultItem parseItem(Element item) throws NLCException {
        NLCResultItem resultItem = new NLCResultItem();

        Elements titleLink = item.select("div.info h4 a");
        resultItem.url = evalJS(titleLink.attr("onclick"));
        resultItem.imageURL = evalJS(item.select("div.img a").attr("onclick"));
        resultItem.title = titleLink.text();

        // The detail fields are only read when the item has exactly four
        // paragraphs: type, author, publication (year + publisher), data source.
        Elements info = item.select("div.info p");
        if (info.size() == 4) {
            resultItem.type = firstText(info.get(0).select("em"));
            resultItem.author = firstText(info.get(1).select("em"));
            Integer year = parseIntOrNull(info.get(2).select("span.pubDate em").text());
            if (year != null) {
                // Leave the field at its default when the year is absent or not a number.
                resultItem.publishYear = year;
            }
            resultItem.publisher = info.get(2).select("span.pub em").text();
            resultItem.dataSource = firstText(info.get(3).select("em"));
        }

        resultItem.summary = evalJS(item.select("div.info div.moreNav a#summary").attr("onclick"));
        resultItem.catelog = evalJS(item.select("div.info div.moreNav a#catelog").attr("onclick"));
        resultItem.collection = evalJS(item.select("div.info div.moreNav a#collection").attr("onclick"));
        return resultItem;
    }

    /** Returns the text of the first matched element, or {@code ""} when nothing matched. */
    private static String firstText(Elements elements) {
        Element first = elements.first();
        return first != null ? first.text() : "";
    }

    /** Parses trimmed decimal text; returns {@code null} for null, blank or non-numeric input. */
    private static Integer parseIntOrNull(String text) {
        if (text == null) {
            return null;
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Integer.valueOf(trimmed);
        } catch (NumberFormatException ignored) {
            // Scraped text is not guaranteed to be numeric; callers pick the fallback.
            return null;
        }
    }

    /** Like {@link #parseIntOrNull(String)} but falls back to {@code 0}. */
    private static int parseIntOrZero(String text) {
        Integer parsed = parseIntOrNull(text);
        return parsed != null ? parsed.intValue() : 0;
    }

    /**
     * Builds the query string from {@code searchCriteria.getParams()} (keys and
     * values URL-encoded as UTF-8) and appends it to the search endpoint.
     *
     * @param searchCriteria supplies the parameters and the page number
     * @return the AJAX endpoint URL when {@code pageNo > 1}, otherwise the
     *         regular search endpoint URL, each followed by {@code ?} and the query
     * @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}
     */
    public static String getSearchURL(SearchCriteria searchCriteria) throws UnsupportedEncodingException {
        StringBuilder query = new StringBuilder();
        for (Map.Entry<String, String> entry : searchCriteria.getParams().entrySet()) {
            if (query.length() > 0) {
                query.append('&');
            }
            query.append(URLEncoder.encode(entry.getKey(), "UTF-8"))
                 .append('=')
                 .append(URLEncoder.encode(entry.getValue(), "UTF-8"));
        }

        String endpoint = searchCriteria.pageNo > 1 ? URL_AJAX : URL_SEARCH;
        return endpoint + "?" + query.toString();
    }

    /**
     * Resolves an {@code onclick} handler to the URL it would open. Two
     * five-argument calls are recognised, for example:
     *
     * <pre>
     * makeDetailUrl(this, '/search/showDocDetails?', '1759949349318496421', 'ucs01', '7-121-02298-2');
     * toggleDocExpandInfo(this, 'catelog', '1759949349318496421', '馆藏中文资源', 'ucs01')
     * </pre>
     *
     * The first argument ({@code this}) is ignored; the other four are
     * unquoted and handed to the matching Java method.
     *
     * @param js the raw attribute value; may be empty
     * @return the resolved URL, or {@code js} unchanged when it is not one of
     *         the recognised calls
     * @throws NLCException if URL-encoding an argument fails
     */
    private static String evalJS(String js) throws NLCException {
        if (js == null) {
            return null;
        }

        int open = js.indexOf('(');
        int close = js.lastIndexOf(')');
        // Also rejects a ')' that precedes the first '(' (would make substring throw).
        if (open == -1 || close < open) {
            return js;
        }

        String func = js.substring(0, open).trim();
        List<String> args = splitArguments(js.substring(open + 1, close));
        if (args.size() != 5) {
            return js;
        }

        try {
            if (func.equals("makeDetailUrl")) {
                return makeDetailUrl(trimQuota(args.get(1)), trimQuota(args.get(2)),
                        trimQuota(args.get(3)), trimQuota(args.get(4)));
            }
            if (func.equals("toggleDocExpandInfo")) {
                return toggleDocExpandInfo(trimQuota(args.get(1)), trimQuota(args.get(2)),
                        trimQuota(args.get(3)), trimQuota(args.get(4)));
            }
        } catch (UnsupportedEncodingException e) {
            throw new NLCException(e);
        }

        return js;
    }

    /**
     * Splits a JavaScript argument list on top-level commas. Commas inside
     * single- or double-quoted strings do not split, so a quoted query or
     * title that itself contains a comma stays one argument. A backslash
     * inside a quoted string keeps the following character literal.
     *
     * @param argList the text between the call's parentheses
     * @return the raw (still quoted, untrimmed) arguments; never empty
     */
    private static List<String> splitArguments(String argList) {
        List<String> args = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        char quote = 0; // 0 = outside a string literal, otherwise the opening quote char

        for (int i = 0; i < argList.length(); i++) {
            char c = argList.charAt(i);
            if (quote != 0) {
                current.append(c);
                if (c == '\\' && i + 1 < argList.length()) {
                    current.append(argList.charAt(++i));
                } else if (c == quote) {
                    quote = 0;
                }
            } else if (c == '\'' || c == '"') {
                quote = c;
                current.append(c);
            } else if (c == ',') {
                args.add(current.toString());
                current.setLength(0);
            } else {
                current.append(c);
            }
        }
        args.add(current.toString());
        return args;
    }

    /**
     * Trims whitespace and removes one pair of matching surrounding quotes
     * (single or double) if present.
     *
     * @param val raw argument text
     * @return the trimmed, unquoted value
     */
    private static String trimQuota(String val) {
        String newVal = val.trim();
        if (newVal.length() >= 2) {
            char head = newVal.charAt(0);
            char tail = newVal.charAt(newVal.length() - 1);
            if ((head == '\'' || head == '"') && head == tail) {
                return newVal.substring(1, newVal.length() - 1);
            }
        }

        return newVal;
    }

    /**
     * Java counterpart of the page's {@code makeDetailUrl} handler.
     *
     * @param url        path ending in {@code ?}, appended to the site base
     * @param docId      document identifier
     * @param dataSource data source code
     * @param query      search text; URL-encoded as UTF-8
     * @return the absolute detail-page URL
     * @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}
     */
    private static String makeDetailUrl(String url, String docId, String dataSource, String query) throws UnsupportedEncodingException {
        return String.format("%s%sdocId=%s&dataSource=%s&query=%s", URL_BASE, url, docId, dataSource, URLEncoder.encode(query, "UTF-8"));
    }

    /**
     * Java counterpart of the page's {@code toggleDocExpandInfo} handler.
     *
     * <p>NOTE(review): {@code tab} and {@code dataSource} are not used, so the
     * summary, catelog and collection links all resolve to the same URL.
     * Confirm whether the endpoint expects the tab name as a parameter.
     *
     * @param tab          tab name from the handler (unused)
     * @param docId        document identifier
     * @param dataSource   display name of the data source (unused)
     * @param dataSourceEn data source code placed in the URL
     * @return the absolute expand-info URL
     * @throws UnsupportedEncodingException never thrown by the current body; kept in the signature
     */
    private static String toggleDocExpandInfo(String tab, String docId, String dataSource, String dataSourceEn) throws UnsupportedEncodingException {
        return String.format("%s%sdocId=%s&dataSource=%s", URL_BASE, "/search/showExpandInfo?", docId, dataSourceEn);
    }
}
【热门文章】
【热门文章】