Java Crawler

Goals:
1. Crawl every novel on the target site, with a scheduled task that periodically checks for updates.
2. The crawled data must include the synopsis, author name, cover image, novel title, the individual chapters, the latest updated chapter, and the completion status.
3. All of the above must be stored in a database.
4. Decide in what form the chapter content itself should be stored in the database.
Approach:
1. Framework candidates: SeimiCrawler, jsoup, WebMagic (jsoup was chosen).
2. Use Selenium (a browser-automation testing tool driving Chrome) to crawl dynamically loaded pages.
Python reference

First, a static Python crawler that illustrates the same idea (locating tags with regex and CSS/XPath selectors):
```python
import os
import re
import time

import parsel
import requests

url = 'https://www.zwwx.com/book/67/67510/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/'
                  '537.36 Edg/103.0.1264.49'
}
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding

# Pull the chapter links and the book title out of the table-of-contents page
href = re.findall('<dd><a href="(.*?)">.*?</a></dd>', response.text)
name = re.findall('<dt>(.*?)</dt>', response.text)[0]

os.makedirs('novel', exist_ok=True)  # make sure the output directory exists

for index in href:
    index_url = 'https://www.zwwx.com' + index
    res = requests.get(url=index_url, headers=headers)
    res.encoding = res.apparent_encoding
    selector = parsel.Selector(res.text)
    title = selector.css('.bookname > h1::text').get()
    content_list = selector.css('#content::text').getall()
    content = '\n'.join(content_list)
    # Append each chapter to a single text file per book
    with open(f'novel\\{name}.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
    time.sleep(0.5)
    print('saving:', title)
```
Dynamic crawler:
```python
import os
import time

import parsel
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as es

opt = Options()
opt.add_argument('--headless')
opt.add_argument('--disable-gpu')
driver = Chrome(options=opt)

url = 'https://www.maofly.com/manga/6996/451102.html'
driver.get(url)
time.sleep(3)  # let the page render

for i in range(1, 10000):
    # Dismiss any alert box before touching the page
    result = es.alert_is_present()(driver)
    if result:
        print(result.text)
        result.accept()
        time.sleep(3)
    else:
        print('no alert present')
        res = driver.page_source
        selector = parsel.Selector(res)
        img_url = selector.xpath('//*[@id="all"]/div/div[2]/div[1]/img/@src').get()
        pic_title = selector.xpath('/html/body/div/h2/text()').get()
        pic_name = selector.xpath('/html/body/div/h1/a/text()').get()
        os.makedirs('img\\' + pic_name, exist_ok=True)
        img = requests.get(url=img_url).content
        name = f'{pic_title}_{i}'  # was a tuple in the original; a string keeps the filename valid
        print('saving:', name, img_url)
        with open(f'img\\{pic_name}\\{name}.jpg', mode='wb') as f:
            f.write(img)
        print('saved:', name)
        # Click through to the next page
        button = driver.find_element(By.XPATH, '/html/body/div/div[2]/nav/div/a[4]')
        button.click()
```
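Since the project itself is in Java, the same dynamic approach could look roughly like this (a minimal sketch, assuming the selenium-java 4.x dependency and a chromedriver on the PATH; the URL and XPath are carried over from the Python example purely for illustration):

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

import java.time.Duration;

public class DynamicPageDemo {
    public static void main(String[] args) {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless", "--disable-gpu");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get("https://www.maofly.com/manga/6996/451102.html");
            // Wait until the dynamically loaded image is present instead of sleeping blindly
            new WebDriverWait(driver, Duration.ofSeconds(10)).until(
                    ExpectedConditions.presenceOfElementLocated(
                            By.xpath("//*[@id=\"all\"]/div/div[2]/div[1]/img")));
            // Hand the rendered HTML to jsoup so the same selector code can be reused
            Document doc = Jsoup.parse(driver.getPageSource());
            String imgUrl = doc.select("#all img").attr("src");
            System.out.println("image URL: " + imgUrl);
        } finally {
            driver.quit();  // always release the browser process
        }
    }
}
```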
After watching 狂神's videos I expect the crawling itself to be manageable. The biggest open problems are implementing the database writes and the periodic re-crawl for updated chapters (the latter can wait until the site tests are done; a sketch of one scheduling option follows).
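For the periodic update check, one simple option (a sketch only; `checkForUpdates()` is a hypothetical entry point standing in for the crawl-and-compare logic) is a `ScheduledExecutorService`:

```java
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class UpdateScheduler {
    public static void main(String[] args) {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        // scheduleWithFixedDelay waits for the previous run to finish before the next
        // delay starts, so a slow crawl never overlaps with itself
        scheduler.scheduleWithFixedDelay(() -> {
            try {
                // checkForUpdates();  // hypothetical: re-fetch each book page, compare the
                //                     // latest chapter name against book_info, insert new rows
                System.out.println("update check finished");
            } catch (Exception e) {
                e.printStackTrace();  // an uncaught exception would cancel the scheduled task
            }
        }, 0, 6, TimeUnit.HOURS);
    }
}
```

In a Spring Boot project the same effect could come from a `@Scheduled(fixedDelay = ...)` method instead.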
Jsoup: crawling the novel URLs (code)

```java
// Collect the detail-page URL of every novel in category 1 (fantasy), listing pages 1..max
public List<String> getBookUrl1() throws IOException, InterruptedException {
    ArrayList<String> bookUrlList1 = new ArrayList<>();
    int count = 0;
    int max = 5;
    for (int i = 1; i <= max; i++) {
        String urlEver = "https://www.9biqu.com/class/1/" + i + ".html";
        try {
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(urlEver)
                    .userAgent(ua)
                    .proxy(currentProxy)
                    .timeout(1000000000)  // very large timeout (ms) so slow proxies don't throw
                    .get();
            Thread.sleep(3000);
            Elements NameUrlList = document.select("#newscontent > div.update-list > div > div > ul > li");
            for (Element el : NameUrlList) {
                String BookUrlPart = el.select("span.s2 > a").attr("href");
                String BookUrl = "https://www.9biqu.com" + BookUrlPart;
                bookUrlList1.add(BookUrl);
                count++;
                System.out.println("Collected " + count + " fantasy-novel URLs so far");
            }
        } catch (SocketException e) {
            System.out.println("IP banned, waiting 10 seconds...");
            Thread.sleep(10000);
            i--;  // retry the same listing page after switching proxy
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("Proxy IP failed while collecting URLs, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }
    }
    System.out.println("All fantasy-novel URLs collected");
    return bookUrlList1;
}
```
Crawling the novel metadata (code)

```java
// Visit each book's detail page and persist its metadata to book_info
public List<BookInfo> listBookDetails() throws IOException, InterruptedException {
    int count = 0;
    BUrl1 = getBookUrl1();
    for (int i = 0; i < BUrl1.size(); i++) {
        String urlEverBook = BUrl1.get(i);
        try {
            count++;
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(urlEverBook)
                    .userAgent(ua)
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();
            String bookName = document.select("#info > h1").text();
            String authorName = document.select("#info > p:nth-child(2) > a").text();
            String picUrl = "https://www.9biqu.com" + document.select("#fmimg > img").attr("src");
            String lastChapterName = document.select("#info > p:nth-child(5) > a").text();
            if (authorName.equals("小说免费阅读")) {
                authorName = "Tec";  // fallback when the site shows a placeholder instead of an author
            }
            String bookIntro = document.select("#intro").text();
            System.out.println("Book " + count + " title: " + bookName);
            System.out.println("Book " + count + " author: " + authorName);
            System.out.println("Book " + count + " intro: " + bookIntro);
            System.out.println("Book " + count + " cover URL: " + picUrl);
            System.out.println("Book " + count + " latest chapter: " + lastChapterName);
            // Deduplicate on book_name before inserting
            QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
            BookInfoNameQueryWrapper.eq("book_name", bookName);
            BookInfo bookInfo = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);
            if (bookInfo == null) {
                BookInfo bookInfo1 = new BookInfo();
                bookInfo1.setWorkDirection(0);
                bookInfo1.setCategoryId(1L);
                bookInfo1.setCategoryName("玄幻奇幻");
                bookInfo1.setPicUrl(picUrl);
                bookInfo1.setBookName(bookName);
                bookInfo1.setAuthorId(0L);
                bookInfo1.setAuthorName(authorName);
                bookInfo1.setBookDesc(bookIntro);
                bookInfo1.setScore(6);
                bookInfo1.setBookStatus(0);
                bookInfo1.setVisitCount(100L);
                bookInfo1.setCommentCount(0);
                bookInfo1.setLastChapterName(lastChapterName);
                bookInfo1.setLastChapterUpdateTime(LocalDateTime.now());
                bookInfo1.setCreateTime(LocalDateTime.now());
                bookInfo1.setUpdateTime(LocalDateTime.now());
                bookInfo1.setIsVip(0);
                bookInfoMapper.insert(bookInfo1);
                System.out.println("Row inserted into book_info");
            } else {
                System.out.println("book_info already contains a novel named " + bookName);
            }
            Thread.sleep(4000);
        } catch (SocketException e) {
            System.out.println("IP banned while crawling book info, waiting 10 seconds...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("Proxy IP failed while crawling book info, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }
    }
    return null;
}
```
Crawling the chapter content (code)

```java
// Fetch one chapter page and return the raw HTML of its content div,
// skipping the first <p> (site boilerplate); retry until something comes back
public String getBookContent(String BookContentUrl) throws IOException, InterruptedException {
    String bookContent = null;
    while (bookContent == null) {
        try {
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(BookContentUrl)
                    .userAgent(ua)
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();
            Thread.sleep(4000);
            Elements bookContentList = document.select("#content > *:not(p:first-child)");
            bookContent = bookContentList.toString();
        } catch (SocketException e) {
            System.out.println("IP banned while crawling chapter content, waiting 10 seconds...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("Proxy IP failed while crawling chapter content, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }
    }
    return bookContent;
}

// Word count for a chapter: count only Han characters
public int countChineseCharacters(String bookContent) {
    if (bookContent == null || bookContent.trim().isEmpty()) {
        return 0;
    }
    int count = 0;
    String[] words = bookContent.trim().split("\\s+");
    for (String word : words) {
        for (char c : word.toCharArray()) {
            if (isChineseCharacter(c)) {
                count++;
            }
        }
    }
    return count;
}

public boolean isChineseCharacter(char c) {
    Character.UnicodeScript script = Character.UnicodeScript.of(c);
    return script == Character.UnicodeScript.HAN;
}
```
Crawling the chapters (code)

```java
public List<BookInfo> listBookContent() throws IOException, InterruptedException {
    int count = 0;
    int countChapter = 0;
    BUrl1 = getBookUrl1();
    for (int i = 0; i < BUrl1.size(); i++) {
        String urlEverBook = BUrl1.get(i);
        try {
            count++;
            Proxy currentProxy = proxies.get(currentProxyIndex);
            Document document = Jsoup.connect(urlEverBook)
                    .userAgent(ua)
                    .proxy(currentProxy)
                    .timeout(1000000000)
                    .get();
            String bookName = document.select("#info > h1").text();
            String bookChapterName;
            String bookChapterUrlPart;
            String bookChapterUrl;
            String bookContent;
            Long bookId = 0L;
            // Skip the first 14 <dd> entries (the "latest chapters" block duplicated at the top)
            Elements bookChapter = document.select("#list > dl > dd:gt(13)");
            for (Element el : bookChapter) {
                countChapter++;
                bookChapterName = el.select("a").text();
                bookChapterUrlPart = el.select("a").attr("href");
                bookChapterUrl = "https://www.9biqu.com" + bookChapterUrlPart;
                bookContent = getBookContent(bookChapterUrl);
                int chineseCharCount = countChineseCharacters(bookContent);
                System.out.println("Book " + count + ", chapter " + countChapter + " name: " + bookChapterName);
                System.out.println("Book " + count + ", chapter " + countChapter + " URL: " + bookChapterUrl);
                // Look up the book's id in book_info by name
                QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
                BookInfoNameQueryWrapper.eq("book_name", bookName);
                BookInfo bookInfoText = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);
                // Guard against an NPE when the book is missing from book_info
                bookId = (bookInfoText != null) ? bookInfoText.getId() : 0L;
                // Deduplicate the chapter on (chapter_name, book_id)
                QueryWrapper<BookChapter> BookChapterTextQueryWrapper = new QueryWrapper<>();
                BookChapterTextQueryWrapper.eq("chapter_name", bookChapterName);
                BookChapterTextQueryWrapper.eq("book_id", bookId);
                BookChapter bookChapterText = bookChapterMapper.selectOne(BookChapterTextQueryWrapper);
                if (bookInfoText != null) {
                    if (bookChapterText == null) {
                        BookChapter bookChapter1 = new BookChapter();
                        bookChapter1.setBookId(bookId);
                        bookChapter1.setChapterNum(countChapter);
                        bookChapter1.setChapterName(bookChapterName);
                        bookChapter1.setWordCount(chineseCharCount);
                        bookChapter1.setIsVip(0);
                        bookChapter1.setCreateTime(LocalDateTime.now());
                        bookChapter1.setUpdateTime(LocalDateTime.now());
                        bookChapterMapper.insert(bookChapter1);
                        System.out.println("Row inserted into book_chapter");
                    } else {
                        System.out.println("book_chapter already contains chapter " + bookChapterName + " of " + bookName);
                    }
                } else {
                    System.out.println("book_info has no entry named " + bookName + ", skipping chapter insert");
                }
                // Fetch the chapter's id back so the content row can reference it
                Long chapterId = 0L;
                QueryWrapper<BookChapter> bookChapterQueryWrapper = new QueryWrapper<>();
                bookChapterQueryWrapper.eq("book_id", bookId);
                bookChapterQueryWrapper.eq("chapter_name", bookChapterName);
                BookChapter bookChapterText2 = bookChapterMapper.selectOne(bookChapterQueryWrapper);
                if (bookChapterText2 != null) {
                    chapterId = bookChapterText2.getId();
                }
                // Store the chapter body (a raw HTML string) in book_content, keyed by chapter_id
                QueryWrapper<BookContent> BookContentQueryWrapper = new QueryWrapper<>();
                BookContentQueryWrapper.eq("chapter_id", chapterId);
                BookContent bookContentText = bookContentMapper.selectOne(BookContentQueryWrapper);
                if (bookContentText == null && bookInfoText != null) {
                    BookContent bookContent1 = new BookContent();
                    bookContent1.setChapterId(chapterId);
                    bookContent1.setContent(bookContent);
                    bookContent1.setCreateTime(LocalDateTime.now());
                    bookContent1.setUpdateTime(LocalDateTime.now());
                    bookContentMapper.insert(bookContent1);
                    System.out.println("Row inserted into book_content");
                } else {
                    System.out.println("book_content already has this chapter of " + bookName);
                }
            }
            Thread.sleep(4000);
        } catch (SocketException e) {
            System.out.println("IP banned while crawling chapters, waiting 10 seconds...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        } catch (IOException e) {
            System.out.println("Proxy IP failed while crawling chapters, switching IP...");
            Thread.sleep(10000);
            currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
        }
    }
    return null;
}
```
Related notes

Two ways to parse HTML:

1. Jsoup.parse
```java
Document document = Jsoup.parse(new URL(urlEver), 300000);
```
2. Jsoup.connect
userAgent(...) sets the User-Agent request header
proxy(...) routes the request through a different IP
timeout(...) raises the connection timeout to avoid timeout exceptions
```java
Document document = Jsoup.connect(urlEverBook)
        .userAgent(ua)
        .proxy(currentProxy)
        .timeout(1000000000)
        .get();
```
Ways to get elements

DOM traversal:
.getElementsByTag("h1") selects by tag name
.getElementById("intro") selects by id
.text() gets the text content
.attr("href") gets the value of an attribute
```java
String bookName = document.getElementsByTag("h1").get(0).text();
String src = el.getElementsByTag("a").eq(0).attr("href");
```
CSS selectors:
.select("#info > h1") selects by selector path
.text() gets the text content
.attr("href") gets the value of an attribute
```java
String bookName = document.select("#info > h1").text();

Elements NameUrlList = document.select("#newscontent > div.update-list > div > div > ul > li");
for (Element el : NameUrlList) {
    String BookUrlPart = el.select("span.s2 > a").attr("href");
    String BookUrl = "https://www.9biqu.com" + BookUrlPart;
    bookUrlList1.add(BookUrl);
    count++;
    System.out.println("Collected " + count + " novel URLs so far");
}

Elements bookChapter = document.select("#list > dl > dd:gt(13)");
for (Element el : bookChapter) {
    countChapter++;
    bookChapterName = el.select("a").text();
    bookChapterUrlPart = el.select("a").attr("href");
    bookChapterUrl = "https://www.9biqu.com" + bookChapterUrlPart;
    System.out.println("Book " + count + ", chapter " + countChapter + " name: " + bookChapterName);
    System.out.println("Book " + count + ", chapter " + countChapter + " URL: " + bookChapterUrl);
    chapterNameList.add(bookChapterName);
    chapterUrlList.add(bookChapterUrl);
}

public String getBookContent(String BookContentUrl) throws IOException, InterruptedException {
    String bookContent;
    Document document = Jsoup.connect(BookContentUrl)
            .userAgent(ua)
            .proxy(proxy)
            .get();
    Thread.sleep(4000);
    Elements bookContentList = document.select("#content > *:not(p:first-child)");
    bookContent = bookContentList.toString();
    return bookContent;
}
```
Jsoup pseudo-selectors

:lt(n): elements whose sibling index (their position among siblings under the same parent) is less than n, e.g. td:lt(3) selects the first three cells of each row
:gt(n): elements whose sibling index is greater than n, e.g. div p:gt(2) selects the p elements inside a div from the fourth one onward (this is what dd:gt(13) above relies on)
:eq(n): elements whose sibling index equals n, e.g. form input:eq(1) selects the second input inside a form
:has(selector): elements that contain an element matching the selector, e.g. div:has(p) selects divs that contain a p
:not(selector): elements that do not match the selector, e.g. div:not(.logo) selects every div without class "logo"
:contains(text): elements containing the given text, case-insensitively, e.g. p:contains(jsoup)
:containsOwn(text): elements whose own text (not their children's) contains the given text
:matches(regex): elements whose text matches the given regular expression, e.g. div:matches((?i)login)
:matchesOwn(regex): elements whose own text matches the given regular expression

A small runnable demonstration of a few of these follows.
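A minimal, self-contained demonstration (the HTML snippet is invented for illustration):

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SelectorDemo {
    public static void main(String[] args) {
        Document doc = Jsoup.parse(
                "<dl>"
                + "<dd><a href='/1.html'>Chapter 1</a></dd>"
                + "<dd><a href='/2.html'>Chapter 2</a></dd>"
                + "<dd><a href='/3.html'>Chapter 3</a></dd>"
                + "<dd><a href='/4.html'>Chapter 4</a></dd>"
                + "</dl>");
        // :gt(n) filters by sibling index, so dd:gt(1) keeps the 3rd dd onward
        System.out.println(doc.select("dd:gt(1) a").eachText());   // [Chapter 3, Chapter 4]
        // :eq(n) picks the element at sibling index n (0-based)
        System.out.println(doc.select("dd:eq(0) a").attr("href")); // /1.html
        // :contains(text) matches case-insensitively on the element's text
        System.out.println(doc.select("a:contains(chapter 2)").text()); // Chapter 2
    }
}
```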
Exception handling: use try/catch. Generally, after the first exception shows up, act on its message, e.g. print diagnostics or sleep the thread. Watch whether the try sits inside a loop; a decrement like i-- keeps the loop from silently skipping the failed iteration.
You can also throw an exception yourself inside the try block. Here, if the content selection comes back empty, storing it directly would persist a blank chapter (note that select() returns an empty Elements collection rather than null, so the right check is isEmpty() rather than a null test). Throwing early hands control to the catch block, after which bookContentList is fetched again:
```java
try {
    Proxy currentProxy = proxies.get(currentProxyIndex);
    Document document = Jsoup.connect(BookContentUrl)
            .userAgent(ua)
            .proxy(currentProxy)
            .timeout(1000000000)
            .get();
    Thread.sleep(4000);
    Elements bookContentList = document.select("#content > *:not(p:first-child)");
    if (!bookContentList.isEmpty()) {  // select() returns an empty Elements, never null
        bookContent = bookContentList.toString();
    } else {
        throw new SocketException("Failed to fetch chapter content, will retry");
    }
} catch (SocketException e) {
    System.out.println("IP banned while crawling chapter content, waiting 10 seconds...");
    Thread.sleep(10000);
    currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
} catch (IOException e) {
    System.out.println("Proxy IP failed while crawling chapter content, switching IP...");
    Thread.sleep(10000);
    currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
}
```
Building an IP proxy pool: first create a List holding the proxy IPs and ports, plus a currentProxyIndex to track the current position.
```java
List<Proxy> proxies = Arrays.asList(
        new Proxy(Proxy.Type.HTTP, new InetSocketAddress("139.200.74.178", 4283)),
        new Proxy(Proxy.Type.HTTP, new InetSocketAddress("60.169.245.20", 4215)),
        new Proxy(Proxy.Type.HTTP, new InetSocketAddress("183.147.27.251", 4231))
);
int currentProxyIndex = 0;
```
Take one IP and port from the list and assign it to currentProxy:
```java
Proxy currentProxy = proxies.get(currentProxyIndex);
```
Then attach the proxy from the pool to the connection with the proxy() method:
```java
Document document = Jsoup.connect(urlEverBook)
        .proxy(currentProxy)
        .get();
```
Finally, rotate to the next IP as circumstances require:
```java
catch (SocketException e) {
    System.out.println("IP banned while crawling book info, switching IP...");
    currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
}
```
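Put together, the pool, the lookup, and the rotation could be wrapped in one small helper (a sketch; the class name, proxy addresses, and retry policy are illustrative, not the project's actual code):

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.Arrays;
import java.util.List;

public class ProxyPool {
    // Illustrative proxies; real ones expire quickly and must be refreshed
    private final List<Proxy> proxies = Arrays.asList(
            new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8001)),
            new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8002)));
    private int currentProxyIndex = 0;

    // Fetch a document, rotating to the next proxy on failure until one works
    public Document fetch(String url, String ua, int maxAttempts) throws IOException {
        IOException last = null;
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            try {
                return Jsoup.connect(url)
                        .userAgent(ua)
                        .proxy(proxies.get(currentProxyIndex))
                        .timeout(30_000)
                        .get();
            } catch (IOException e) {
                last = e;
                currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
            }
        }
        throw last;  // every proxy failed
    }
}
```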
Handling needed when writing to the database

Preventing duplicate rows:

```java
QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
BookInfoNameQueryWrapper.eq("book_name", bookName);
BookInfo bookInfo = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);

if (bookInfo == null) {
    BookInfo bookInfo1 = new BookInfo();
    bookInfo1.setBookName(bookName);
    bookInfoMapper.insert(bookInfo1);
    System.out.println("Row inserted into book_info");
} else {
    System.out.println("book_info already contains a novel named " + bookName);
}
```
Reading a value from one table to store in another:

```java
QueryWrapper<BookInfo> BookInfoNameQueryWrapper = new QueryWrapper<>();
BookInfoNameQueryWrapper.eq("book_name", bookName);
BookInfo bookInfoId = bookInfoMapper.selectOne(BookInfoNameQueryWrapper);
bookId = bookInfoId.getId();

BookChapter bookChapter1 = new BookChapter();
bookChapter1.setBookId(bookId);
bookChapterMapper.insert(bookChapter1);
```
Preventing empty data:

```java
String bookContent = null;
while (bookContent == null) {
    try {
        Proxy currentProxy = proxies.get(currentProxyIndex);
        Document document = Jsoup.connect(BookContentUrl)
                .userAgent(ua)
                .proxy(currentProxy)
                .timeout(1000000000)
                .get();
        Thread.sleep(4000);
        Elements bookContentList = document.select("#content > *:not(p:first-child)");
        // Only accept a non-empty selection; an empty Elements stringifies to ""
        // (not null), which would otherwise end the retry loop with blank content
        if (!bookContentList.isEmpty()) {
            bookContent = bookContentList.toString();
        }
    } catch (SocketException e) {
        System.out.println("IP banned while crawling chapter content, waiting 10 seconds...");
        Thread.sleep(10000);
        currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
    } catch (IOException e) {
        System.out.println("Proxy IP failed while crawling chapter content, switching IP...");
        Thread.sleep(10000);
        currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
    }
}
return bookContent;
```