Java Web Crawler

Goals:

1. Crawl every novel on the target site (with a scheduled job that periodically checks for updates)
2. The crawled data must include the synopsis, author name, cover image, novel title, the chapters themselves, the latest chapter, and the completion status
3. All of the above must be stored in a database
4. Decide in what form the novel content should be stored in the database (one possible shape is sketched below)
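One way to answer item 4, as a sketch: keep book metadata, the chapter list, and the chapter text in separate tables, with each chapter body stored as a plain HTML/text string. The field names below mirror the BookInfo/BookChapter/BookContent entities used in the code later in this post; the comments about column conventions are assumptions.

// Sketch of the storage model assumed by the crawl code below.
public class BookInfo {             // table book_info: one row of metadata per book
    private Long id;
    private String bookName;        // novel title
    private String authorName;      // author
    private String bookDesc;        // synopsis
    private String picUrl;          // cover image URL
    private String lastChapterName; // latest chapter
    private Integer bookStatus;     // completion status, e.g. 0 = ongoing (assumed convention)
    // getters/setters omitted
}

public class BookChapter {          // table book_chapter: one row per chapter
    private Long id;
    private Long bookId;            // foreign key to book_info
    private Integer chapterNum;
    private String chapterName;
    // getters/setters omitted
}

public class BookContent {          // table book_content: the chapter body itself
    private Long id;
    private Long chapterId;         // foreign key to book_chapter
    private String content;         // chapter text stored as an HTML/text string
    // getters/setters omitted
}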

Approach

1. Framework candidates: SeimiCrawler, Jsoup, WebMagic (Jsoup was chosen)

2. Use a browser-automation tool such as Selenium (driving headless Chrome) to crawl dynamically loaded pages, as sketched below
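A minimal sketch of that route in Java, assuming the selenium-java dependency is available; the URL is a placeholder. Selenium lets the page's JavaScript run first, then the rendered HTML can be handed to Jsoup:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class DynamicPageFetch {
    public static void main(String[] args) {
        // Run Chrome headless so no browser window opens
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless", "--disable-gpu");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get("https://example.com/some-dynamic-page"); // placeholder URL
            // The JavaScript has now rendered the page; parse the result with Jsoup
            Document document = Jsoup.parse(driver.getPageSource());
            System.out.println(document.title());
        } finally {
            driver.quit(); // always release the browser process
        }
    }
}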

Python approach

First, here is a static Python crawler that illustrates the same idea (locating tags with XPath/CSS selectors and regular expressions)

import re
import time

import parsel
import requests

url = 'https://www.zwwx.com/book/67/67510/'
# spoofed request headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
                         '537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/'
                         '537.36 Edg/103.0.1264.49'}
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding  # auto-detect the encoding so Chinese text decodes correctly
# extraction with re regular expressions
# print(response.text)
# capture what you need with (.*?); skip what you don't with .*?
href = re.findall('<dd><a href="(.*?)">.*?</a></dd>', response.text)
# same idea; [i] picks the i-th match, counting from 0
name = re.findall('<dt>(.*?)</dt>', response.text)[0]

for index in href:  # index is each chapter link collected above
    index_url = 'https://www.zwwx.com' + index
    res = requests.get(url=index_url, headers=headers)
    res.encoding = res.apparent_encoding
    # with tags present, use CSS/XPath (add ::text to extract strings); without tags, use re (no ::text)
    selector = parsel.Selector(res.text)
    title = selector.css('.bookname > h1::text').get()
    # verbose form: #wrapper > div.content_read > div > div.bookname > h1::text
    # h1::text extracts the text inside the tag; .get() returns it
    content_list = selector.css('#content::text').getall()
    content = '\n'.join(content_list)  # join the list into one string
    # .get() returns a single match; .getall() returns ALL matches, as a list rather than a string
    with open(f'novel\\{name}.txt', mode='a', encoding='utf-8') as f:
        # mode='w' overwrites, mode='a' appends; with open() closes the file
        # automatically, so no explicit f.close() is needed
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    time.sleep(0.5)
    print('Saving:', title)  # literal text needs quotes; variables don't; join them with commas

Dynamic crawler

import os
import time

import parsel
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as es

opt = Options()
opt.add_argument('--headless')
opt.add_argument('--disable-gpu')
driver = Chrome(options=opt)
url = 'https://www.maofly.com/manga/6996/451102.html'

driver.get(url=url)
time.sleep(3)  # give the page time to finish loading

page = 1  # image index within the current chapter
for _ in range(1, 10000):
    result = es.alert_is_present()(driver)

    if result:  # an alert marks the end of a chapter
        print(result.text)
        result.accept()
        time.sleep(3)
        page = 1  # restart the numbering for the next chapter
    else:
        print('No alert window')
    res = driver.page_source
    selector = parsel.Selector(res)
    # //*[@id="all"]/div/div[2]/div[1]/img
    img_url = selector.xpath('//*[@id="all"]/div/div[2]/div[1]/img/@src').get()
    pic_title = selector.xpath('/html/body/div/h2/text()').get()
    pic_name = selector.xpath('/html/body/div/h1/a/text()').get()
    if not os.path.exists('img\\' + pic_name):  # create the folder if it does not exist yet
        os.mkdir('img\\' + pic_name)
    img = requests.get(url=img_url).content
    name = pic_title, page  # a tuple; pic_title + page would fail (str + int)
    print('Saving:', name, img_url)

    with open(f'img\\{pic_name}\\{name}.jpg', mode='wb') as f:
        # mode='wb' writes binary data; with open() closes the file automatically;
        # variables go inside { }, fixed parts are typed directly
        f.write(img)
    print('Saved:', name)
    page += 1

    button = driver.find_element(By.XPATH, '/html/body/div/div[2]/nav/div/a[4]')
    button.click()

After watching Kuangshen's (狂神) videos I don't expect much trouble with the crawling itself; the biggest open problems are implementing the database writes and re-crawling for updated content on a schedule (this can wait until the site tests are done). A sketch of the scheduled re-crawl follows.
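A minimal sketch of the scheduled update check using the JDK's ScheduledExecutorService. The Crawler interface is a hypothetical stand-in for the class holding listBookDetails()/listBookContent() shown below, and the six-hour interval is an arbitrary assumption:

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class CrawlScheduler {
    /** Hypothetical entry point into the crawl code shown further down. */
    public interface Crawler {
        void crawlOnce() throws Exception; // e.g. listBookDetails() + listBookContent()
    }

    public static void start(Crawler crawler) {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        // Re-run the whole crawl every 6 hours (the interval is an assumption).
        // The duplicate checks in the code below make re-runs idempotent, so
        // only newly published books/chapters get inserted.
        scheduler.scheduleAtFixedRate(() -> {
            try {
                crawler.crawlOnce();
            } catch (Exception e) {
                e.printStackTrace(); // swallow failures so the schedule stays alive
            }
        }, 0, 6, TimeUnit.HOURS);
    }
}

In a Spring Boot project the same job is usually expressed with a @Scheduled(cron = "...") method instead.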

Jsoup

Code for crawling the novel URLs

public List<String> getBookUrl1() throws IOException, InterruptedException {

    ArrayList<String> bookUrlList1 = new ArrayList<>();
    // count tracks how many book URLs have been collected
    int count = 0;
    // there are 5 pages in total
    int max = 5;
    for (int i = 1; i <= max; i++) {
        String urlEver = "https://www.9biqu.com/class/1/" + i + ".html";

        try {
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(urlEver)
                    .userAgent(ua)
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();
            Thread.sleep(3000);
            Elements NameUrlList = document.select("#newscontent > div.update-list > div > div > ul > li");
            for (Element el : NameUrlList) {
                String BookUrlPart = el.select("span.s2 > a").attr("href");
                String BookUrl = "https://www.9biqu.com" + BookUrlPart;
                bookUrlList1.add(BookUrl);
                count++;
                System.out.println("Collected fantasy novel URL #" + count);
            }
        } catch (SocketException e) {
            System.out.println("IP blocked, waiting 10 seconds...");
            Thread.sleep(10000);
            i--; // retry the same page
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("IP failed while crawling book URLs, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }
    }
    System.out.println("All fantasy novel URLs collected");

    return bookUrlList1;
}

Code for crawling novel metadata

public List<BookInfo> listBookDetails() throws IOException, InterruptedException {

    int count = 0;
    BUrl1 = getBookUrl1();
    for (int i = 0; i < BUrl1.size(); i++) {
        String urlEverBook = BUrl1.get(i);
        try {
            count++;
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(urlEverBook)
                    .userAgent(ua)
                    // hand the proxy object to Jsoup
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();

            String bookName = document.select("#info > h1").text();
            String authorName = document.select("#info > p:nth-child(2) > a").text();

            String picUrl = "https://www.9biqu.com" + document.select("#fmimg > img").attr("src");
            String lastChapterName = document.select("#info > p:nth-child(5) > a").text();

            // fall back to a placeholder when the scraped name is the site's boilerplate text
            if (authorName.equals("小说免费阅读")) {
                authorName = "Tec";
            }
            String bookIntro = document.select("#intro").text();

            System.out.println("Book #" + count + " title: " + bookName);
            System.out.println("Book #" + count + " author: " + authorName);
            System.out.println("Book #" + count + " intro: " + bookIntro);
            System.out.println("Book #" + count + " cover URL: " + picUrl);
            System.out.println("Book #" + count + " latest chapter: " + lastChapterName);

            QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
            BookInfoNameQueryWrapper.eq("book_name", bookName);
            BookInfo bookInfo = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);

            if (bookInfo == null) {
                BookInfo bookInfo1 = new BookInfo();
                bookInfo1.setWorkDirection(0);
                bookInfo1.setCategoryId(1L);
                bookInfo1.setCategoryName("玄幻奇幻");
                bookInfo1.setPicUrl(picUrl);
                bookInfo1.setBookName(bookName);
                bookInfo1.setAuthorId(0L);
                bookInfo1.setAuthorName(authorName);
                bookInfo1.setBookDesc(bookIntro);
                bookInfo1.setScore(6);
                bookInfo1.setBookStatus(0);
                bookInfo1.setVisitCount(100L);
                //bookInfo1.setWordCount();
                bookInfo1.setCommentCount(0);
                //bookInfo1.setLastChapterId();
                bookInfo1.setLastChapterName(lastChapterName);
                bookInfo1.setLastChapterUpdateTime(LocalDateTime.now());
                bookInfo1.setCreateTime(LocalDateTime.now());
                bookInfo1.setUpdateTime(LocalDateTime.now());
                bookInfo1.setIsVip(0);

                bookInfoMapper.insert(bookInfo1);
                System.out.println("Row inserted into table bookInfo");
            } else {
                System.out.println("A novel named " + bookName + " already exists in table bookInfo");
            }

            Thread.sleep(4000);
        } catch (SocketException e) {
            System.out.println("IP blocked while crawling novel metadata, waiting 10 seconds...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("IP failed while crawling novel metadata, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }

    }
    return null;
}

Code for crawling chapter content

public String getBookContent(String BookContentUrl) throws IOException, InterruptedException {

    String bookContent = null;
    while (bookContent == null) {
        try {
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(BookContentUrl)
                    .userAgent(ua)
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();
            Thread.sleep(4000);
            Elements bookContentList = document.select("#content > *:not(p:first-child)");

            bookContent = bookContentList.toString();

        } catch (SocketException e) {
            System.out.println("IP blocked while crawling chapter content, waiting 10 seconds...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("IP failed while crawling chapter content, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }
    }
    return bookContent;
}

/**
 * Count the number of Chinese characters.
 */
public int countChineseCharacters(String bookContent) {
    if (bookContent == null || bookContent.trim().isEmpty()) {
        return 0;
    }

    int count = 0;
    String[] words = bookContent.trim().split("\\s+");
    for (String word : words) {
        for (char c : word.toCharArray()) {
            if (isChineseCharacter(c)) {
                count++;
            }
        }
    }
    return count;
}

/**
 * Check whether a character is Chinese (Han script).
 */
public boolean isChineseCharacter(char c) {
    Character.UnicodeScript script = Character.UnicodeScript.of(c);
    return script == Character.UnicodeScript.HAN;
}

Code for crawling the chapter lists

public List<BookInfo> listBookContent() throws IOException, InterruptedException {

    int count = 0;
    int countChapter = 0;
    BUrl1 = getBookUrl1();
    for (int i = 0; i < BUrl1.size(); i++) {
        String urlEverBook = BUrl1.get(i);
        try {
            count++;
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(urlEverBook)
                    .userAgent(ua)
                    // hand the proxy object to Jsoup
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();

            String bookName = document.select("#info > h1").text();

            // the code below fetches each chapter's name, link, and content
            String bookChapterName;
            String bookChapterUrlPart;
            String bookChapterUrl;

            String bookContent;

            Long bookId = 0L;

            Elements bookChapter = document.select("#list > dl > dd:gt(13)");
            for (Element el : bookChapter) {
                countChapter++;
                bookChapterName = el.select("a").text();
                bookChapterUrlPart = el.select("a").attr("href");
                bookChapterUrl = "https://www.9biqu.com" + bookChapterUrlPart;
                bookContent = getBookContent(bookChapterUrl);
                int chineseCharCount = countChineseCharacters(bookContent);

                System.out.println("Book #" + count + " chapter #" + countChapter + " name: " + bookChapterName);
                System.out.println("Book #" + count + " chapter #" + countChapter + " link: " + bookChapterUrl);
                //System.out.println("Book #" + count + " chapter #" + countChapter + " content: " + bookContent);

                // check whether this novel already exists
                QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
                BookInfoNameQueryWrapper.eq("book_name", bookName);
                BookInfo bookInfoText = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);

                BookInfo bookInfoId = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);
                bookId = bookInfoId.getId();

                // check whether this chapter already exists
                QueryWrapper<BookChapter> BookChapterTextQueryWrapper = new QueryWrapper<>();
                BookChapterTextQueryWrapper.eq("chapter_name", bookChapterName);
                BookChapterTextQueryWrapper.eq("book_id", bookId);
                BookChapter bookChapterText = bookChapterMapper.selectOne(BookChapterTextQueryWrapper);

                // insert the chapter only if no chapter with the same name exists
                if (bookInfoText != null) {
                    if (bookChapterText == null) {

                        // store the row in bookChapter
                        BookChapter bookChapter1 = new BookChapter();
                        bookChapter1.setBookId(bookId);
                        bookChapter1.setChapterNum(countChapter);
                        bookChapter1.setChapterName(bookChapterName);
                        bookChapter1.setWordCount(chineseCharCount);
                        bookChapter1.setIsVip(0);
                        bookChapter1.setCreateTime(LocalDateTime.now());
                        bookChapter1.setUpdateTime(LocalDateTime.now());

                        bookChapterMapper.insert(bookChapter1);

                        System.out.println("Row inserted into table bookChapter");

                    } else {
                        // the chapter already exists, so reuse the existing row
                        System.out.println("Chapter " + bookChapterName + " of novel " + bookName + " already exists in table bookChapter");
                    }
                } else {
                    System.out.println("No novel named " + bookName + " exists in table bookChapter");
                }

                Long chapterId = 0L;

                // look up the chapter by book id and chapter name
                QueryWrapper<BookChapter> bookChapterQueryWrapper = new QueryWrapper<>();
                bookChapterQueryWrapper.eq("book_id", bookId);
                bookChapterQueryWrapper.eq("chapter_name", bookChapterName);
                BookChapter bookChapterText2 = bookChapterMapper.selectOne(bookChapterQueryWrapper);

                // if the chapter exists, take its ID
                if (bookChapterText2 != null) {
                    chapterId = bookChapterText2.getId();
                }

                QueryWrapper<BookContent> BookContentQueryWrapper = new QueryWrapper<>();
                BookContentQueryWrapper.eq("chapter_id", chapterId);
                BookContent bookContentText = bookContentMapper.selectOne(BookContentQueryWrapper);

                // insert the content only if no identical chapter content exists
                if (bookContentText == null && bookInfoText != null) {
                    BookContent bookContent1 = new BookContent();
                    bookContent1.setChapterId(chapterId);
                    bookContent1.setContent(bookContent);
                    bookContent1.setCreateTime(LocalDateTime.now());
                    bookContent1.setUpdateTime(LocalDateTime.now());

                    bookContentMapper.insert(bookContent1);

                    System.out.println("Row inserted into table bookContent");
                } else {
                    // the content already exists, so skip the insert
                    System.out.println("Content for novel " + bookName + " already exists in table bookContent");
                }

            }
            Thread.sleep(4000);
        } catch (SocketException e) {
            System.out.println("IP blocked while crawling chapters, waiting 10 seconds...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("IP failed while crawling chapters, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }

    }
    return null;
}

Related notes

Two ways to parse HTML

1. Jsoup.parse

Document document = Jsoup.parse(new URL(urlEver), 300000);

2. Jsoup.connect

userAgent sets the request's User-Agent header

proxy switches the outgoing IP

timeout guards against timeout exceptions

Document document = Jsoup.connect(urlEverBook)
        .userAgent(ua)
        .proxy(currentProxy)
        .timeout(1000000000)
        .get();

Ways to select elements

DOM traversal

.getElementsByTag("h1"): by tag name

getElementById("intro"): by id

.text() returns the element's text

.attr("href") returns one of the element's attributes

String bookName = document.getElementsByTag("h1").get(0).text();
String src = el.getElementsByTag("a").eq(0).attr("href");

CSS selectors

.select("#info > h1"): by selector path

.text() returns the element's text

.attr("href") returns one of the element's attributes

// Fetch a single value:
String bookName = document.select("#info > h1").text();

// Fetch multiple values:
// be careful to pick the right scope for the select
// select("span.s2 > a")
Elements NameUrlList = document.select("#newscontent > div.update-list > div > div > ul > li");
for (Element el : NameUrlList) {
    String BookUrlPart = el.select("span.s2 > a").attr("href");
    String BookUrl = "https://www.9biqu.com" + BookUrlPart;
    bookUrlList1.add(BookUrl);
    count++;
    System.out.println("Collected novel URL #" + count);
}

// Fetch multiple values with a condition:
// start after the 14th entry (sibling index greater than 13)
// #list > dl > dd:gt(13)
Elements bookChapter = document.select("#list > dl > dd:gt(13)");
for (Element el : bookChapter) {
    countChapter++;
    bookChapterName = el.select("a").text();
    bookChapterUrlPart = el.select("a").attr("href");
    bookChapterUrl = "https://www.9biqu.com" + bookChapterUrlPart;

    System.out.println("Book #" + count + " chapter #" + countChapter + " name: " + bookChapterName);
    System.out.println("Book #" + count + " chapter #" + countChapter + " link: " + bookChapterUrl);

    chapterNameList.add(bookChapterName);
    chapterUrlList.add(bookChapterUrl);
}

// Select everything except the first element:
public String getBookContent(String BookContentUrl) throws IOException, InterruptedException {

    String bookContent;
    Document document = Jsoup.connect(BookContentUrl)
            .userAgent(ua)
            .proxy(proxy)
            .get();
    Thread.sleep(4000);
    // #content > *:not(p:first-child)
    // selects every child of the element with id "content" except the first p
    Elements bookContentList = document.select("#content > *:not(p:first-child)");
    // toString() turns the Elements object into a String
    bookContent = bookContentList.toString();

    return bookContent;
}

Jsoup selectors

Pseudo-selectors

:lt(n): finds elements whose sibling index (their position among siblings under the same parent in the DOM tree) is less than n, e.g. td:lt(3) matches the first three cells in a row

:gt(n): finds elements whose sibling index is greater than n, e.g. div p:gt(2) matches p elements inside a div that come after the first three

:eq(n): finds elements whose sibling index equals n, e.g. form input:eq(1) matches the second input inside a form

:has(selector): finds elements that contain an element matching the selector, e.g. div:has(p) matches divs that contain a p

:not(selector): finds elements that do not match the selector, e.g. div:not(.logo) matches all divs without class="logo"

:contains(text): finds elements containing the given text; the search is case-insensitive, e.g. p:contains(jsoup)

:containsOwn(text): finds elements that directly contain the given text

:matches(regex): finds elements whose text matches the given regular expression, e.g. div:matches((?i)login)

:matchesOwn(regex): finds elements whose own text matches the given regular expression
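A small self-contained check of a few of these pseudo-selectors, parsing an inline HTML string (the markup is made up for illustration, echoing the site's #list > dl > dd:gt(13) pattern):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SelectorDemo {
    public static void main(String[] args) {
        // Made-up markup, just to exercise the selectors above
        String html = "<dl id='list'>"
                + "<dd><a href='/1.html'>Chapter 1</a></dd>"
                + "<dd><a href='/2.html'>Chapter 2</a></dd>"
                + "<dd><a href='/3.html'>Chapter 3</a></dd>"
                + "</dl>";
        Document doc = Jsoup.parse(html);

        // :gt(0) skips the first dd (sibling index 0)
        System.out.println(doc.select("#list > dd:gt(0)").text());      // Chapter 2 Chapter 3
        // :eq(1) takes exactly the second dd
        System.out.println(doc.select("#list > dd:eq(1)").text());      // Chapter 2
        // :contains matches on text, case-insensitively
        System.out.println(doc.select("dd:contains(chapter 3)").text()); // Chapter 3
    }
}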

Exception handling

Use try/catch for error handling. Generally, after the first exception occurs, act on its message, for example by printing diagnostics or sleeping the thread (note whether the try sits inside a loop; something like i-- keeps the loop on the same iteration so no page is skipped).

You can also throw an exception yourself inside the try block. Here, if bookContentList were null and got written straight into the database, we would hit a NullPointerException; so we raise the error early into the catch block and then fetch bookContentList again.

try {
    Proxy currentProxy = proxies.get(currentProxyIndex);
    Document document = Jsoup.connect(BookContentUrl)
            .userAgent(ua)
            .proxy(currentProxy)
            .timeout(1000000000)
            .get();
    Thread.sleep(4000);
    Elements bookContentList = document.select("#content > *:not(p:first-child)");

    if (bookContentList != null) {
        bookContent = bookContentList.toString();
    } else {
        throw new SocketException("Failed to fetch the chapter content, will retry");
    }
} catch (SocketException e) {
    System.out.println("IP blocked while crawling chapter content, waiting 10 seconds...");
    Thread.sleep(10000);
    currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
} catch (IOException e) {
    System.out.println("IP failed while crawling chapter content, switching IP...");
    Thread.sleep(10000);
    currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
}

Building an IP proxy pool

First create a List holding the IPs and ports, and define a currentProxyIndex that tracks the current position

List<Proxy> proxies = Arrays.asList(
        new Proxy(Proxy.Type.HTTP, new InetSocketAddress("139.200.74.178", 4283)),
        new Proxy(Proxy.Type.HTTP, new InetSocketAddress("60.169.245.20", 4215)),
        new Proxy(Proxy.Type.HTTP, new InetSocketAddress("183.147.27.251", 4231))
);
int currentProxyIndex = 0;

Take one proxy (IP and port) from the list and assign it to currentProxy

Proxy currentProxy = proxies.get(currentProxyIndex);

Then pass it to the connection via the proxy method

Document document = Jsoup.connect(urlEverBook)
        .proxy(currentProxy)
        .get();

Finally, rotate to the next IP when needed

catch (SocketException e) {
    System.out.println("IP blocked while crawling novel metadata, switching IP...");
    // round-robin to the next proxy in the pool
    currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
}

Handling needed when writing to the database

Preventing duplicate rows
// first constrain the query with a QueryWrapper
QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
BookInfoNameQueryWrapper.eq("book_name", bookName);
BookInfo bookInfo = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);
// if selectOne returns null the row does not exist yet, so insert it
if (bookInfo == null) {
    BookInfo bookInfo1 = new BookInfo();
    bookInfo1.setBookName(bookName);
    //·········

    bookInfoMapper.insert(bookInfo1);
    System.out.println("Row inserted into table bookInfo");
} else {
    System.out.println("A novel named " + bookName + " already exists in table bookInfo");
}
Reading one table's data to populate another
// first constrain the query with a QueryWrapper
QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
BookInfoNameQueryWrapper.eq("book_name", bookName);
BookInfo bookInfoId = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);
// fetch the ID (it is auto-incremented, so call getId only after the rest of the row has been inserted)
bookId = bookInfoId.getId();
// store the data into the other table, book_chapter
BookChapter bookChapter1 = new BookChapter();
bookChapter1.setBookId(bookId);
bookChapterMapper.insert(bookChapter1);
Preventing null data
String bookContent = null;
// keep retrying the try block as long as bookContent is still null
while (bookContent == null) {
    try {
        Proxy currentProxy = proxies.get(currentProxyIndex);
        Document document = Jsoup.connect(BookContentUrl)
                .userAgent(ua)
                .proxy(currentProxy)
                .timeout(1000000000)
                .get();
        Thread.sleep(4000);
        Elements bookContentList = document.select("#content > *:not(p:first-child)");

        bookContent = bookContentList.toString();

    } catch (SocketException e) {
        System.out.println("IP blocked while crawling chapter content, waiting 10 seconds...");
        Thread.sleep(10000);
        currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
    } catch (IOException e) {
        System.out.println("IP failed while crawling chapter content, switching IP...");
        Thread.sleep(10000);
        currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
    }
}
return bookContent;