ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

jsoup爬取全国省市区

2021-02-02 12:02:17  阅读:213  来源: 互联网

标签:code name Haha 爬取 jsoup connect parentId 省市区 id


项目中经常用到全国省市区的数据表,但这些数据又会经常变动,每次都需要重新查找最新的数据,很麻烦。特此记录一下如何用 jsoup 爬取国家统计局的数据。

1.引入jar包

<!-- jsoup: library used by the crawler code in this article to fetch and parse HTML pages.
     The original 1.7.3 predates the fix for CVE-2021-37714 (fixed in 1.14.2); since the
     crawler parses remote HTML, use a patched release. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>

2.创建数据表,生成Haha实体类

-- Region table filled by the crawler: provinces, cities and counties share this one table
-- and are linked through parent_id (which defaults to 0).
-- Changes from the dumped definition:
--   * no AUTO_INCREMENT=3647 table option, so ids of a freshly created table start at 1;
--   * utf8mb4 instead of the deprecated 3-byte `utf8` alias;
--   * an index on parent_id for "children of X" lookups.
DROP TABLE IF EXISTS `haha`;
CREATE TABLE `haha` (
    `id` int NOT NULL AUTO_INCREMENT,
    -- nullable: the crawler leaves code unset when a province name is not in its lookup table
    `code` bigint DEFAULT NULL,
    `name` varchar(255) NOT NULL,
    `parent_id` int NOT NULL DEFAULT 0,
    PRIMARY KEY (`id`),
    KEY `haha_parent_id_idx` (`parent_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

 

3.实现接口

// Province lookup table as a JSON array literal: each entry carries code, id, name and parentId (0 for every entry).
        // The crawl code compares scraped province names against "name" to obtain the matching "code".
        String provinceJson = "[{\"code\":110000,\"id\":1,\"name\":\"北京市\",\"parentId\":0}," +
                "{\"code\":120000,\"id\":2,\"name\":\"天津市\",\"parentId\":0}," +
                "{\"code\":130000,\"id\":3,\"name\":\"河北省\",\"parentId\":0}," +
                "{\"code\":140000,\"id\":4,\"name\":\"山西省\",\"parentId\":0}," +
                "{\"code\":150000,\"id\":5,\"name\":\"内蒙古自治区\",\"parentId\":0}," +
                "{\"code\":210000,\"id\":6,\"name\":\"辽宁省\",\"parentId\":0}," +
                "{\"code\":220000,\"id\":7,\"name\":\"吉林省\",\"parentId\":0}," +
                "{\"code\":230000,\"id\":8,\"name\":\"黑龙江省\",\"parentId\":0}," +
                "{\"code\":310000,\"id\":9,\"name\":\"上海市\",\"parentId\":0}," +
                "{\"code\":320000,\"id\":10,\"name\":\"江苏省\",\"parentId\":0}," +
                "{\"code\":330000,\"id\":11,\"name\":\"浙江省\",\"parentId\":0}," +
                "{\"code\":340000,\"id\":12,\"name\":\"安徽省\",\"parentId\":0}," +
                "{\"code\":350000,\"id\":13,\"name\":\"福建省\",\"parentId\":0}," +
                "{\"code\":360000,\"id\":14,\"name\":\"江西省\",\"parentId\":0}," +
                "{\"code\":370000,\"id\":15,\"name\":\"山东省\",\"parentId\":0}," +
                "{\"code\":410000,\"id\":16,\"name\":\"河南省\",\"parentId\":0}," +
                "{\"code\":420000,\"id\":17,\"name\":\"湖北省\",\"parentId\":0}," +
                "{\"code\":430000,\"id\":18,\"name\":\"湖南省\",\"parentId\":0}," +
                "{\"code\":440000,\"id\":19,\"name\":\"广东省\",\"parentId\":0}," +
                "{\"code\":450000,\"id\":20,\"name\":\"广西壮族自治区\",\"parentId\":0}," +
                "{\"code\":460000,\"id\":21,\"name\":\"海南省\",\"parentId\":0}," +
                "{\"code\":500000,\"id\":22,\"name\":\"重庆市\",\"parentId\":0}," +
                "{\"code\":510000,\"id\":23,\"name\":\"四川省\",\"parentId\":0}," +
                "{\"code\":520000,\"id\":24,\"name\":\"贵州省\",\"parentId\":0}," +
                "{\"code\":530000,\"id\":25,\"name\":\"云南省\",\"parentId\":0}," +
                "{\"code\":540000,\"id\":26,\"name\":\"西藏自治区\",\"parentId\":0}," +
                "{\"code\":610000,\"id\":27,\"name\":\"陕西省\",\"parentId\":0}," +
                "{\"code\":620000,\"id\":28,\"name\":\"甘肃省\",\"parentId\":0}," +
                "{\"code\":630000,\"id\":29,\"name\":\"青海省\",\"parentId\":0}," +
                "{\"code\":640000,\"id\":30,\"name\":\"宁夏回族自治区\",\"parentId\":0}," +
                "{\"code\":650000,\"id\":31,\"name\":\"新疆维吾尔自治区\",\"parentId\":0}," +
                "{\"code\":710000,\"id\":32,\"name\":\"台湾\",\"parentId\":0}," +
                "{\"code\":810000,\"id\":33,\"name\":\"香港特别行政区\",\"parentId\":0}," +
                "{\"code\":820000,\"id\":34,\"name\":\"澳门特别行政区\",\"parentId\":0}]\n";

        // Parse the literal into a JSONArray so its entries can be read by index.
        JSONArray array = JSONArray.parseArray(provinceJson);
        // Directory of the National Bureau of Statistics division-code listing; every href
        // scraped below is appended to this base to build the next page URL.
        final String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
        final String userAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
        final int timeoutMillis = 6000;

        String url = baseUrl + "index.html";

        // Connect to the index page.
        Connection connect = Jsoup.connect(url);
        // Set the user agent and the timeout, then request the page with GET.
        Document document = connect.userAgent(userAgent).
                timeout(timeoutMillis).ignoreContentType(true).get();

        // Province level: each "provincetr" row contains cells, each cell wrapping one province link.
        for (Element provinceRow : document.getElementsByClass("provincetr")) {
            for (Element provinceCell : provinceRow.children()) {
                // A cell without a child element would make child(0) throw; skip it.
                if (provinceCell.children().isEmpty()) {
                    continue;
                }
                Element provinceLink = provinceCell.child(0);
                String provinceName = provinceLink.text();

                Haha province = new Haha();
                province.setName(provinceName);
                // The index page is only read for names here, so the code comes from the
                // lookup table; it stays unset when the name is not listed there.
                for (int i = 0; i < array.size(); i++) {
                    JSONObject json = array.getJSONObject(i);
                    if (provinceName.equals(json.getString("name"))) {
                        province.setCode(json.getLong("code"));
                        break;
                    }
                }

                province.save();

                // Without an href there is no province page to descend into.
                String provinceHref = provinceLink.attr("href");
                if (provinceHref.isEmpty()) {
                    continue;
                }
                Document provincePage = Jsoup.connect(baseUrl + provinceHref).userAgent(userAgent).
                        timeout(timeoutMillis).ignoreContentType(true).get();

                // City level: the first <a> of a "citytr" row holds the code, the following one the name.
                for (Element cityRow : provincePage.getElementsByClass("citytr")) {
                    Haha city = new Haha();
                    // Kept per row on purpose: a variable shared across rows would keep the
                    // previous row's URL when this row has no name link, and that page's
                    // counties would then be stored under the wrong city.
                    String cityHref = "";
                    boolean firstLink = true;

                    for (Element cityLink : cityRow.getElementsByTag("a")) {
                        if (firstLink) {
                            firstLink = false;
                            // Drop the last six digits of the scraped code.
                            city.setCode(Long.parseLong(cityLink.text().trim()) / 1000000);
                        } else {
                            cityHref = cityLink.attr("href");
                            city.setName(cityLink.text().trim());
                        }
                    }
                    city.setParentId(province.getId());
                    city.save();

                    // No link to a county page for this city: nothing further to crawl.
                    if (cityHref.isEmpty()) {
                        continue;
                    }
                    Document cityPage = Jsoup.connect(baseUrl + cityHref).userAgent(userAgent).
                            timeout(timeoutMillis).ignoreContentType(true).get();

                    // County level: read the <td> cells rather than the links.
                    // The first cell is the code, the following one the name.
                    for (Element countyRow : cityPage.getElementsByClass("countytr")) {
                        Haha county = new Haha();
                        boolean firstCell = true;

                        for (Element countyCell : countyRow.getElementsByTag("td")) {
                            if (firstCell) {
                                firstCell = false;
                                // Drop the last six digits of the scraped code.
                                county.setCode(Long.parseLong(countyCell.text().trim()) / 1000000);
                            } else {
                                county.setName(countyCell.text().trim());
                            }
                        }
                        county.setParentId(city.getId());
                        county.save();
                    }
                }
            }
        }
View Code

 

4.请求接口

http://localhost/demo

 

标签:code,name,Haha,爬取,jsoup,connect,parentId,省市区,id
来源: https://www.cnblogs.com/lihongjunjava/p/14361205.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有