利用jsoup获取网页内容,然后对内容进行分析
# 一、maven
```xml
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
```
# 二、使用
package cn.lisynet;
import okhttp3.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Walks the paginated post listing served at {@code http://127.0.0.1:90/page/N}
 * and prints the {@code href} of every post link found on those pages.
 */
public class Baidu {

    /**
     * Media type for plain-text request bodies.
     *
     * <p>{@code MediaType.parse} expects a bare media type such as
     * {@code "text/plain"}. The header-style string
     * {@code "Content-Type: text/plain"} is not a well-formed media type, so
     * parsing it yields {@code null} instead of a usable constant.
     */
    private static final MediaType MEDIA_TYPE_MARKDOWN = MediaType.parse("text/plain");

    private static final Logger logger = LogManager.getLogger(Baidu.class);

    private static final OkHttpClient client = new OkHttpClient();

    /**
     * Fetches listing pages starting at page 1, collecting the {@code href}
     * attribute of every element matching {@code .post-preview a}, and stops
     * after the first page that has no {@code .next a} element. The collected
     * links are then written to standard output, one per line.
     *
     * @throws IOException if fetching any page fails
     */
    public static void submit() throws IOException {
        List<String> links = new ArrayList<>();
        int index = 1;
        boolean hasNextPage;
        do {
            Document doc = Jsoup.connect("http://127.0.0.1:90/page/" + index)
                    .timeout(30000)
                    .get();
            index++;

            for (Element link : doc.select(".post-preview a")) {
                links.add(link.attr("href"));
            }

            // select() never returns null; an empty result means there is no
            // "next page" link, i.e. this was the last page.
            hasNextPage = !doc.select(".next a").isEmpty();
        } while (hasNextPage);

        StringBuilder paramString = new StringBuilder();
        for (String link : links) {
            paramString.append(link).append("\n");
        }
        // Show all collected links.
        System.out.println(paramString);
    }
}
`http://127.0.0.1:90/page/1` 返回的 HTML 形式:
<html>
<header>
<title>测试</title>
</header>
<body>
<div class="post-preview">
<a href="https://127.0.0.1:90/archives/rrrr">
<h2 class="post-title">
标题1
</h2>
<div class="post-content-preview">
摘要1
</div>
</a>
<p class="post-meta">
信息1
</p>
</div>
<hr>
<div class="post-preview">
<a href="https://127.0.0.1:90/archives/rrrr4">
<h2 class="post-title">
标题4
</h2>
<div class="post-content-preview">
摘要4
</div>
</a>
<p class="post-meta">
信息4
</p>
</div>
<hr>
<div class="post-preview">
<a href="https://127.0.0.1:90/archives/rrrr2">
<h2 class="post-title">
标题2
</h2>
<div class="post-content-preview">
摘要2
</div>
</a>
<p class="post-meta">
信息2
</p>
</div>
<hr>
<div class="post-preview">
<a href="https://127.0.0.1:90/archives/rrrr3">
<h2 class="post-title">
标题3
</h2>
<div class="post-content-preview">
摘要3
</div>
</a>
<p class="post-meta">
信息3
</p>
</div>
<hr>
<ul class="pager">
<li class="next">
<a href="https://127.0.0.1:90/page/2">Older Posts →</a>
</li>
</ul>
</body>
</html>
上次更新: 2024/01/07, 07:44:52