ICode9

精准搜索请尝试: 精确搜索
首页 > 编程语言> 文章详细

基于正则表达式的Java爬虫项目

2022-04-19 14:34:38  阅读:165  来源: 互联网

标签:Java java 正则表达式 matcher 爬虫 System println import out


 

  需求分析:抓取新闻网前100条新闻标题以及对应的网页新闻的链接

  编者这里以齐鲁工业大学校园新闻网为示例,利用Java网络编程、多线程、正则表达式来实现对新闻内容的抓取。(注:由于校园网限制,未连接齐鲁工业大学校园网时,可能暂时无法抓取全部内容,或抓取到的内容存在缺失,这属于正常情况。)

源代码示例:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

 

/**
 * Regex-based crawler for the news list pages under {@code https://www.qlu.edu.cn/gdyw/}.
 *
 * <p>The crawl has three steps: request a page, read its HTML as plain text, and extract the
 * wanted fields from that text with regular expressions (literals, character classes and the
 * {@code ? + *} quantifiers). For every news item found on a list page the crawler prints the
 * link and title, then downloads the linked article and prints the text of its body spans.
 */
public class NetUrl {

    /** Root against which list-page paths and article links are resolved. */
    private static final String SITE_ROOT = "https://www.qlu.edu.cn/";

    /** Number of list pages ({@code list1.htm} .. {@code list100.htm}) that are crawled. */
    private static final int LIST_PAGE_COUNT = 100;

    /** Size of the thread pool used by {@link #main(String[])}. */
    private static final int WORKER_COUNT = 5;

    /** Pause after each list-page download, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 1000L;

    /** Connect and read timeout for every request, so a stalled server cannot block a worker forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    // The patterns are compiled once; Pattern instances are immutable and safe to share.
    // None of them carries a trailing "+" on the outer group: repeating the whole element would
    // merge adjacent elements into one match and keep only the last element's capture.

    /** One news entry of a list page; group 3 is the markup between the {@code <li>} tags. */
    private static final Pattern NEWS_ITEM =
            Pattern.compile("((<li class=\"news n\\d{1,} clearfix\">)(.*?)(</li>))");

    /** Anchor inside a news entry; group 2 is the href, group 5 is the title attribute. */
    private static final Pattern ANCHOR =
            Pattern.compile("(<a href=')(.*?)(')(.*title=')(.*?)(')");

    /** Body text span of an article page; group 3 is the span content. */
    private static final Pattern BODY_SPAN =
            Pattern.compile("((<span style=\"text-indent:0.99cm;font-size:18px\">)(.*?)(</span>))");

    /**
     * Crawls all list pages one after another on the calling thread.
     *
     * <p>A page that fails to download is reported on {@code System.err} and skipped; the
     * remaining pages are still crawled. The loop stops early if the calling thread is
     * interrupted.
     */
    public static void testUrl() {
        for (int page = 1; page <= LIST_PAGE_COUNT; page++) {
            crawlListPage(page);
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
        }
    }

    /**
     * Crawls the list pages with a fixed pool of worker threads, one task per list page.
     *
     * <p>Every task is finite, so the pool drains after {@code shutdown()}.
     *
     * @param args ignored
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        ExecutorService es = Executors.newFixedThreadPool(WORKER_COUNT);
        try {
            for (int page = 1; page <= LIST_PAGE_COUNT; page++) {
                final int listPage = page;
                // execute() rather than submit(): an unexpected runtime exception is then
                // reported by the worker thread instead of being parked in an unread Future.
                es.execute(new Runnable() {
                    @Override
                    public void run() {
                        crawlListPage(listPage);
                    }
                });
            }
        } finally {
            // No further tasks are accepted; the queued ones still run to completion.
            es.shutdown();
        }
    }

    /**
     * Downloads one list page and crawls every news item found on it.
     *
     * @param page 1-based index of the list page ({@code gdyw/list<page>.htm})
     */
    static void crawlListPage(int page) {
        String listUrl = SITE_ROOT + "gdyw/list" + page + ".htm";
        try {
            String listHtml = fetchPage(new URL(listUrl));
            try {
                Thread.sleep(PAGE_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and give up on this page.
                Thread.currentThread().interrupt();
                return;
            }
            for (String[] link : extractNewsLinks(listHtml)) {
                crawlArticle(link[0], link[1]);
            }
            System.out.println(Thread.currentThread().getName() + "_____________——抓取");
        } catch (IOException e) {
            System.err.println("Failed to crawl " + listUrl + ": " + e);
        }
    }

    /**
     * Prints link, title and body text of one news article.
     *
     * <p>The report is assembled first and written with a single {@code println} call so that
     * the lines of one article stay together when several workers print concurrently. A failed
     * article download is noted in the report and does not abort the surrounding list page.
     *
     * @param href  link taken from the list page, relative or absolute
     * @param title title attribute taken from the list page
     */
    private static void crawlArticle(String href, String title) {
        String newline = System.lineSeparator();
        StringBuilder report = new StringBuilder();
        report.append("href = ").append(href).append(newline);
        report.append("title = ").append(title);
        try {
            URL url2 = resolveArticleUrl(href);
            report.append(newline).append("url2 = ").append(url2);
            for (String text : extractParagraphs(fetchPage(url2))) {
                report.append(newline).append("matcher1.group(3) = ").append(text);
            }
        } catch (IOException e) {
            report.append(newline).append("Failed to fetch article: ").append(e);
        }
        System.out.println(report);
    }

    /**
     * Extracts the link and title of every news item in the HTML of a list page.
     *
     * @param listHtml HTML of a list page with the line breaks removed
     * @return one {@code {href, title}} pair per news item that contains a matching anchor, in
     *         document order; items without such an anchor are skipped
     */
    static List<String[]> extractNewsLinks(String listHtml) {
        List<String[]> links = new ArrayList<String[]>();
        Matcher item = NEWS_ITEM.matcher(listHtml);
        while (item.find()) {
            Matcher anchor = ANCHOR.matcher(item.group(3));
            if (anchor.find()) {
                links.add(new String[] {anchor.group(2), anchor.group(5)});
            }
        }
        return links;
    }

    /**
     * Extracts the content of every body text span in the HTML of an article page.
     *
     * @param articleHtml HTML of an article page with the line breaks removed
     * @return the span contents in document order; empty if the page has no such span
     */
    static List<String> extractParagraphs(String articleHtml) {
        List<String> paragraphs = new ArrayList<String>();
        Matcher span = BODY_SPAN.matcher(articleHtml);
        while (span.find()) {
            paragraphs.add(span.group(3));
        }
        return paragraphs;
    }

    /**
     * Resolves an article link against the site root.
     *
     * <p>Resolution (instead of string concatenation) avoids a doubled slash for root-relative
     * links and leaves absolute links untouched.
     *
     * @param href link as found in the list page
     * @return the absolute article URL
     * @throws MalformedURLException if {@code href} cannot be resolved to a valid URL
     */
    static URL resolveArticleUrl(String href) throws MalformedURLException {
        return new URL(new URL(SITE_ROOT), href);
    }

    /**
     * Downloads a page and returns its text with the line breaks removed.
     *
     * <p>The lines are joined without a separator on purpose: {@code .} in the patterns does not
     * match line terminators, so elements that span several source lines would otherwise not
     * be found.
     *
     * @param url page to download
     * @return the page text as a single line
     * @throws IOException if the connection fails or times out
     */
    private static String fetchPage(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        StringBuilder html = new StringBuilder();
        // NOTE(review): assumes the site serves UTF-8 — confirm against its Content-Type header.
        // try-with-resources closes the reader and the underlying stream on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                html.append(line);
            }
        }
        return html.toString();
    }
}

 

上述为网页源代码的示例:

标签:Java,java,正则表达式,matcher,爬虫,System,println,import,out
来源: https://www.cnblogs.com/ZhouSunYu/p/16165071.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有