如何根据输入的商品名(如书名)获取具体的出品公司、出版社?可以从京东、当当上拉取吗?豆瓣那么多数据是从哪里拉取的?
到www.nlc.gov.cn国家图书馆或者类似的网站查询ISBN, 然后你用程序实现这个动作就好了。
因为我一直都比较喜欢看书,所以我希望自己能够开发一个小工具,自动的帮我收集书籍的信息。但是一直没能找到很好的数据源。
根据 @Kslr 的提示,从国家图书馆抓取我要的信息,我用 Jsoup 写了一份代码,能够抓取到图书的基本信息了。
豆瓣开放了自己的API,我们可以直接使用他们的SDK、图书API来查询相关信息。
当当的也有一个API,京东现在也在搞云计算,所以也都是有自己的SDK的。需要我们学习一下。
下面是例子,你可以参考参考。如果我们有共同目标,你还可以联系我,我们可以讨论讨论。
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Runs a search against the catalogue at {@code http://find.nlc.gov.cn} and
 * copies the scraped result items, total hit count and current page number
 * into an {@link NLCResult}.
 *
 * <p>The result page is fetched and parsed with Jsoup; the links of each item
 * are reconstructed from the JavaScript calls found in {@code onclick}
 * attributes (see {@link #evalJS(String)}).
 *
 * Created by pingjiang on 14-6-22.
 */
public class NLCSearcher {
    private static final String URL_BASE = "http://find.nlc.gov.cn";
    private static final String URL = URL_BASE + "/search/doSearch";
    private static final String URL_AJAX = URL_BASE + "/search/ajaxSearch";

    /** Timeout, in milliseconds, for fetching and parsing the result page. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Number of arguments both supported onclick calls carry (including the leading {@code this}). */
    private static final int EXPECTED_ARG_COUNT = 5;

    /** Display labels of the searchable fields; appears index-aligned with {@link #TARGET_FIELDS}. */
    static String[] TARGET_FIELDS_LOGS = new String[] { "全部字段", "题名", "责任者", "关键词", "出版商", "ISBN" };

    /**
     * Field identifiers matching {@link #TARGET_FIELDS_LOGS}.
     * NOTE(review): "identifer" is kept as spelled; presumably this is the
     * value the site expects - confirm before "fixing" the spelling.
     */
    static String[] TARGET_FIELDS = new String[] { "", "alltitle", "allcreator", "keywords", "publisher", "identifer" };

    /**
     * Searches using the criteria stored in the given result object.
     *
     * @param nlcResult result holder; supplies the criteria and receives the items
     * @throws NLCException if the page cannot be fetched or has an unexpected layout
     */
    public void search(NLCResult nlcResult) throws NLCException {
        search(nlcResult.getSearchCriteria(), nlcResult);
    }

    /**
     * Fetches the result page for the given criteria and fills {@code nlcResult}.
     *
     * @param searchCriteria criteria whose {@code buildURL()} yields the page URL
     * @param nlcResult      receives the parsed items, total count and page number
     * @throws NLCException if the page cannot be fetched or has an unexpected layout
     */
    public void search(SearchCriteria searchCriteria, NLCResult nlcResult) throws NLCException {
        try {
            parse(Jsoup.parse(new URL(searchCriteria.buildURL()), TIMEOUT_MILLIS), nlcResult);
        } catch (IOException e) {
            throw new NLCException(e);
        }
    }

    /**
     * Extracts result items, total hit count and current page number from a result page.
     *
     * @throws NLCException if the result container is missing or a counter is not numeric
     */
    private void parse(Document doc, NLCResult nlcResult) throws NLCException {
        Element container = doc.select("div#searchresult-items").first();
        if (container == null) {
            throw new NLCException("div#searchresult-items element is not found");
        }

        Elements items = container.select("div.item");
        for (Element item : items) {
            nlcResult.addResultItem(parseItem(item));
        }

        Element totalCountElement = doc.select("b#totalCnt").first();
        int totalCount = totalCountElement != null ? parseCount(totalCountElement.text()) : 0;
        nlcResult.setTotalCount(totalCount);

        // The pager was originally searched only inside the result items. Keep
        // that lookup first, then fall back to the whole document, because a
        // pager nested inside an individual item is unlikely.
        // NOTE(review): confirm where div.page really sits in the site's markup.
        Element page = items.select("div.page").first();
        if (page == null) {
            page = doc.select("div.page").first();
        }
        if (totalCount > 0 && page != null) {
            Element currentPageNo = page.select("span.current").first();
            nlcResult.setCurrentPageNo(currentPageNo != null ? parseCount(currentPageNo.text()) : 0);
        }
    }

    /** Builds one result item from a {@code div.item} element. */
    private static NLCResultItem parseItem(Element item) throws NLCException {
        NLCResultItem resultItem = new NLCResultItem();
        Elements titleLink = item.select("div.info h4 a");
        resultItem.url = evalJS(titleLink.attr("onclick"));
        resultItem.imageURL = evalJS(item.select("div.img a").attr("onclick"));
        resultItem.title = titleLink.text();

        // The detail fields are only read when the item has exactly the four
        // expected paragraphs: type, author, publication info, data source.
        Elements infoElements = item.select("div.info p");
        if (infoElements.size() == 4) {
            resultItem.type = firstText(infoElements.get(0), "em");
            resultItem.author = firstText(infoElements.get(1), "em");
            String yearText = infoElements.get(2).select("span.pubDate em").text().trim();
            if (!yearText.isEmpty()) {
                try {
                    resultItem.publishYear = Integer.valueOf(yearText);
                } catch (NumberFormatException ignored) {
                    // A missing or non-numeric year must not abort the whole
                    // search; the field simply keeps its default value.
                }
            }
            resultItem.publisher = infoElements.get(2).select("span.pub em").text();
            resultItem.dataSource = firstText(infoElements.get(3), "em");
        }

        resultItem.summary = evalJS(item.select("div.info div.moreNav a#summary").attr("onclick"));
        resultItem.catelog = evalJS(item.select("div.info div.moreNav a#catelog").attr("onclick"));
        resultItem.collection = evalJS(item.select("div.info div.moreNav a#collection").attr("onclick"));
        return resultItem;
    }

    /** Returns the text of the first element matching the query, or "" when nothing matches. */
    private static String firstText(Element parent, String cssQuery) {
        Element match = parent.select(cssQuery).first();
        return match != null ? match.text() : "";
    }

    /**
     * Parses a counter shown on the page.
     *
     * @throws NLCException if the text is not a valid integer; the String
     *         constructor is used because it is the only NLCException
     *         constructor known to accept this case
     */
    private static int parseCount(String text) throws NLCException {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            throw new NLCException("Unexpected numeric value in search result page: '" + text + "'");
        }
    }

    /**
     * Builds the request URL for the given criteria.
     *
     * <p>Every entry of {@code searchCriteria.getParams()} is URL-encoded as
     * UTF-8 and joined with {@code &}. The AJAX endpoint is used for pages
     * after the first, the regular search endpoint otherwise.
     *
     * @param searchCriteria criteria providing the parameters and page number
     * @return the absolute URL including the query string
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public static String getSearchURL(SearchCriteria searchCriteria) throws UnsupportedEncodingException {
        StringBuilder query = new StringBuilder();
        for (Map.Entry<String, String> entry : searchCriteria.getParams().entrySet()) {
            if (query.length() > 0) {
                query.append('&');
            }
            query.append(URLEncoder.encode(entry.getKey(), "UTF-8"))
                 .append('=')
                 .append(URLEncoder.encode(entry.getValue(), "UTF-8"));
        }
        return (searchCriteria.pageNo > 1 ? URL_AJAX : URL) + "?" + query;
    }

    /**
     * Turns a known JavaScript call from an {@code onclick} attribute into the
     * URL that call would open. Supported calls:
     *
     * <pre>
     * makeDetailUrl(this, '/search/showDocDetails?', '1759949349318496421', 'ucs01', '7-121-02298-2');
     * toggleDocExpandInfo(this, 'catelog', '1759949349318496421', '馆藏中文资源', 'ucs01')
     * </pre>
     *
     * @param js the raw attribute value
     * @return the reconstructed URL, or {@code js} unchanged when it is not one
     *         of the supported calls with exactly five arguments
     * @throws NLCException if UTF-8 encoding is unavailable
     */
    private static String evalJS(String js) throws NLCException {
        int open = js.indexOf('(');
        int close = js.lastIndexOf(')');
        // close < open also covers a missing ')' and the malformed ")(" case,
        // which would otherwise make substring() throw.
        if (open == -1 || close < open) {
            return js;
        }
        String func = js.substring(0, open).trim();
        List<String> params = splitArguments(js.substring(open + 1, close));
        if (params.size() != EXPECTED_ARG_COUNT) {
            return js;
        }
        try {
            // params.get(0) is the literal "this" and is not needed.
            if (func.equals("makeDetailUrl")) {
                return makeDetailUrl(trimQuota(params.get(1)), trimQuota(params.get(2)),
                        trimQuota(params.get(3)), trimQuota(params.get(4)));
            } else if (func.equals("toggleDocExpandInfo")) {
                return toggleDocExpandInfo(trimQuota(params.get(1)), trimQuota(params.get(2)),
                        trimQuota(params.get(3)), trimQuota(params.get(4)));
            }
        } catch (UnsupportedEncodingException e) {
            throw new NLCException(e);
        }
        return js;
    }

    /**
     * Splits a JavaScript argument list on top-level commas. Commas inside
     * single- or double-quoted strings do not split, so a quoted argument that
     * itself contains a comma stays in one piece. A backslash inside a quoted
     * string keeps the following character from ending the string.
     */
    private static List<String> splitArguments(String args) {
        List<String> result = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        char quote = 0; // 0 while outside a quoted string, otherwise the opening quote
        for (int i = 0; i < args.length(); i++) {
            char c = args.charAt(i);
            if (quote != 0) {
                current.append(c);
                if (c == '\\' && i + 1 < args.length()) {
                    current.append(args.charAt(++i));
                } else if (c == quote) {
                    quote = 0;
                }
            } else if (c == ',') {
                result.add(current.toString());
                current.setLength(0);
            } else {
                if (c == '\'' || c == '"') {
                    quote = c;
                }
                current.append(c);
            }
        }
        result.add(current.toString());
        return result;
    }

    /** Trims whitespace and removes one pair of matching surrounding quotes (' or "), if present. */
    private static String trimQuota(String val) {
        String newVal = val.trim();
        if (newVal.length() >= 2) {
            char first = newVal.charAt(0);
            char last = newVal.charAt(newVal.length() - 1);
            if (first == last && (first == '\'' || first == '"')) {
                return newVal.substring(1, newVal.length() - 1);
            }
        }
        return newVal;
    }

    /** Builds the detail-page URL; only {@code query} is URL-encoded. */
    private static String makeDetailUrl(String url, String docId, String dataSource, String query) throws UnsupportedEncodingException {
        return String.format("%s%sdocId=%s&dataSource=%s&query=%s", URL_BASE, url, docId, dataSource, URLEncoder.encode(query, "UTF-8"));
    }

    /**
     * Builds the expand-info URL from the document id and the data source code.
     *
     * NOTE(review): {@code tab} and {@code dataSource} are not part of the URL,
     * so the summary, catelog and collection links of one document come out
     * identical - confirm whether the endpoint needs the tab name.
     */
    private static String toggleDocExpandInfo(String tab, String docId, String dataSource, String dataSourceEn) throws UnsupportedEncodingException {
        return String.format("%s%sdocId=%s&dataSource=%s", URL_BASE, "/search/showExpandInfo?", docId, dataSourceEn);
    }
}