Jsoup请求网址数据 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 public static Document getUrlDocument (String url) { try { Document doc = Jsoup.connect(url).timeout(3000 ).get(); if (doc != null ) { return doc; } } catch (IOException e) { logger.error(e.getMessage()); } return null ; }
使用隧道代理
大量调用一些网站会被封ip,可以使用阿布云隧道代理,参照阿布云官网 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 public class JsoupUtils { private final static String ProxyUser = "XXXXXXXXXXXXXXXX" ; private final static String ProxyPass = "XXXXXXXXXXXXXXXX" ; private final static String ProxyHost = "http-dyn.abuyun.com" ; private final static Integer ProxyPort = 9020 ; public static Document getUrlProxyDocument (String url) { Authenticator.setDefault(new Authenticator() { public PasswordAuthentication getPasswordAuthentication () { return new PasswordAuthentication(ProxyUser, ProxyPass.toCharArray()); } }); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ProxyHost, ProxyPort)); try { Document doc = Jsoup.connect(url).timeout(3000 ).proxy(proxy).get(); if (doc != null ) { return doc; } } catch (IOException e) { logger.error(e.getMessage()); } return null ; } }
根据公司名称获取工商信息并解析 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 public static AccountBookInfo getInfo (String bookName) { AccountBookInfo result = new AccountBookInfo(); String encodeName; try { encodeName = URLEncoder.encode(bookName, "utf-8" ); } catch (UnsupportedEncodingException e) { logger.error("encode error!" ); return null ; } Document doc = JsoupUtils.getUrlDocument("http://m.qixin.com/search/" + encodeName + ".html" ); if (doc == null ) { logger.error("first page get error!" ); return null ; } Elements rows = doc.select(".new-search-result a" ); if (rows.size() == 0 ) { logger.error("html match error!" ); return null ; } Element row = rows.get(0 ); Document infoDoc = JsoupUtils.getUrlDocument("http://m.qixin.com" + row.attr("href" ) + "/info" ); if (infoDoc == null ) { logger.error("second page get error!" ); return null ; } Elements headInfo = infoDoc.select(".container-fluid > div:eq(1)" ); result.accountBookName = headInfo.select("> div:eq(0)" ).text(); result.legalPerson = headInfo.select("> div:eq(1) > div > a" ).text(); result.registeredCapital = decode(headInfo.select("> div:eq(2) > span" ).text()); result.establishDate = decode(headInfo.select("> div:eq(3) > span" ).text()); Elements detailInfo = infoDoc.select(".particular-information > .padding-b-1-3x" ); result.registeredNo = detailInfo.select("> div:eq(0) > div > div:eq(0) > div:eq(1)" ).text(); result.organizationCode = detailInfo.select("> div:eq(0) > div > div:eq(1) > div:eq(1)" ).text(); result.socialCreditCode = detailInfo.select("> div:eq(1) > div > div:eq(0) > div:eq(1)" ).text(); result.businessStatus = detailInfo.select("> div:eq(1) > div > div:eq(1) > div:eq(1)" ).text(); result.companyType = detailInfo.select("> div:eq(2) > div:eq(1)" ).text(); result.businessScope = detailInfo.select("> div:eq(3) > div:eq(1)" ).text(); result.address = detailInfo.select("> div:eq(4) > div:eq(1)" ).text(); 
result.businessPeriod = detailInfo.select("> div:eq(5) > div:eq(1)" ).text(); return result; }
输入公司名称或缩写,例如阿里巴巴
这里使用 Jsoup 中类似 jQuery 选择器的语法获取对应的信息,更多的解析方式见:Jsoup 官网。
// Entity
/**
 * Data holder for one company ("account book") record.
 *
 * <p>All fields are public and are assigned directly by {@code getInfo}, which fills
 * the registration-related fields from the scraped page. The remaining fields
 * (tax payer and login/password fields) are not set anywhere in this file.
 */
public class AccountBookInfo {
    // Identifier; annotated as the id with the IDENTITY generation strategy.
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    public Long id;

    // Company name; set by getInfo from the header of the info page.
    public String accountBookName;

    // The next six fields are not populated by getInfo; their exact meaning is not
    // shown in this file. NOTE(review): confirm with the owning module.
    public String taxPayerCode;
    public String taxPayerId;
    public Integer nationalLoginWay;
    public String nationalPwd;
    public Integer landLoginWay;
    public String landPwd;

    // Fields below are all set by getInfo from the scraped info page.
    public String legalPerson;
    // Stored as text after being passed through decode.
    public String registeredCapital;
    public String registeredNo;
    public String organizationCode;
    public String socialCreditCode;
    // Stored as text after being passed through decode.
    public String establishDate;
    public String companyType;
    public String businessScope;
    public String businessStatus;
    public String address;
    public String businessPeriod;
}
日期解码 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 private static String decode (String date) { char [] charArray = date.toCharArray(); for (int i = 0 ; i < charArray.length; i++) { switch (charArray[i]) { case '0' : charArray[i] = '4' ; break ; case '1' : charArray[i] = '0' ; break ; case '2' : charArray[i] = '8' ; break ; case '3' : charArray[i] = '7' ; break ; case '4' : charArray[i] = '9' ; break ; case '5' : charArray[i] = '3' ; break ; case '6' : charArray[i] = '2' ; break ; case '7' : charArray[i] = '5' ; break ; case '8' : charArray[i] = '6' ; break ; case '9' : charArray[i] = '1' ; break ; default : break ; } } return new String(charArray); }