Java爬虫获取企业工商信息

Jsoup请求网址数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/**
* 请求网址数据
*/
public static Document getUrlDocument(String url) {
try {
// 此处自己处理异常、其他参数等
Document doc = Jsoup.connect(url).timeout(3000).get();

if (doc != null) {
return doc;
}
} catch (IOException e) {
logger.error(e.getMessage());
}

return null;
}

使用隧道代理

大量调用一些网站会被封ip,可以使用阿布云隧道代理,参照阿布云官网

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
public class JsoupUtils {

// 阿布云通行证书和通行密钥,注册地址:https://www.abuyun.com
private final static String ProxyUser = "XXXXXXXXXXXXXXXX";
private final static String ProxyPass = "XXXXXXXXXXXXXXXX";

// 代理服务器
private final static String ProxyHost = "http-dyn.abuyun.com";
private final static Integer ProxyPort = 9020;

/**
* 使用阿布云隧道代理请求网址数据
*/
public static Document getUrlProxyDocument(String url) {
Authenticator.setDefault(new Authenticator() {
public PasswordAuthentication getPasswordAuthentication() {
return new PasswordAuthentication(ProxyUser, ProxyPass.toCharArray());
}
});

Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ProxyHost, ProxyPort));

try {
// 此处自己处理异常、其他参数等
Document doc = Jsoup.connect(url).timeout(3000).proxy(proxy).get();

if (doc != null) {
return doc;
}
} catch (IOException e) {
logger.error(e.getMessage());
}

return null;
}

}

根据公司名称获取工商信息并解析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/**
* 根据公司名称获取工商信息并解析
*/
public static AccountBookInfo getInfo(String bookName) {
AccountBookInfo result = new AccountBookInfo();
String encodeName;
try {
encodeName = URLEncoder.encode(bookName, "utf-8");
} catch (UnsupportedEncodingException e) {
logger.error("encode error!");
return null;
}
Document doc = JsoupUtils.getUrlDocument("http://m.qixin.com/search/" + encodeName + ".html");
if (doc == null) {
logger.error("first page get error!");
return null;
}
// 获取详情页地址(列表的第一个公司)
Elements rows = doc.select(".new-search-result a");
if (rows.size() == 0) {
logger.error("html match error!");
return null;
}
Element row = rows.get(0);
Document infoDoc = JsoupUtils.getUrlDocument("http://m.qixin.com" + row.attr("href") + "/info");
if (infoDoc == null) {
logger.error("second page get error!");
return null;
}
Elements headInfo = infoDoc.select(".container-fluid > div:eq(1)");
result.accountBookName = headInfo.select("> div:eq(0)").text();
result.legalPerson = headInfo.select("> div:eq(1) > div > a").text();
result.registeredCapital = decode(headInfo.select("> div:eq(2) > span").text());
// 需要解码
result.establishDate = decode(headInfo.select("> div:eq(3) > span").text());
Elements detailInfo = infoDoc.select(".particular-information > .padding-b-1-3x");
result.registeredNo = detailInfo.select("> div:eq(0) > div > div:eq(0) > div:eq(1)").text();
result.organizationCode = detailInfo.select("> div:eq(0) > div > div:eq(1) > div:eq(1)").text();
result.socialCreditCode = detailInfo.select("> div:eq(1) > div > div:eq(0) > div:eq(1)").text();
result.businessStatus = detailInfo.select("> div:eq(1) > div > div:eq(1) > div:eq(1)").text();
result.companyType = detailInfo.select("> div:eq(2) > div:eq(1)").text();
result.businessScope = detailInfo.select("> div:eq(3) > div:eq(1)").text();
result.address = detailInfo.select("> div:eq(4) > div:eq(1)").text();
result.businessPeriod = detailInfo.select("> div:eq(5) > div:eq(1)").text();
return result;
}
  • 输入公司名称或缩写,例如阿里巴巴
  • 这里使用JsoupJquery选择器的语法获取对应的信息,更多的解析方式见:Jsoup官网

实体

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
public class AccountBookInfo {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
public Long id;
public String accountBookName; //公司名称
public String taxPayerCode; //纳税人识别号
public String taxPayerId; //纳税人身份证号
public Integer nationalLoginWay; //国税登录方式
public String nationalPwd; //国税密码
public Integer landLoginWay; //地税登录方式
public String landPwd; //地税密码
public String legalPerson; //法人
public String registeredCapital; //注册资本
public String registeredNo; //注册号
public String organizationCode; //组织机构代码
public String socialCreditCode; //社会信用代码
public String establishDate; //成立日期
public String companyType; //公司类型
public String businessScope; //经营范围
public String businessStatus; //经营状态
public String address; //地址
public String businessPeriod; //营业期限
}

日期解码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/**
* 日期格式解码
*/
private static String decode(String date) {
char[] charArray = date.toCharArray();
for (int i = 0; i < charArray.length; i++) {
switch (charArray[i]) {
case '0':
charArray[i] = '4';
break;
case '1':
charArray[i] = '0';
break;
case '2':
charArray[i] = '8';
break;
case '3':
charArray[i] = '7';
break;
case '4':
charArray[i] = '9';
break;
case '5':
charArray[i] = '3';
break;
case '6':
charArray[i] = '2';
break;
case '7':
charArray[i] = '5';
break;
case '8':
charArray[i] = '6';
break;
case '9':
charArray[i] = '1';
break;
default:
break;
}
}
return new String(charArray);
}
文章目录
  1. 1. Jsoup请求网址数据
  2. 2. 使用隧道代理
  3. 3. 根据公司名称获取工商信息并解析
  4. 4. 实体
  5. 5. 日期解码