ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

给老子爬爬爬!2019国家统计局最新城乡划分代码

2020-03-31 11:04:57  阅读:342  来源: 互联网

标签:String 城乡 System 2019 import println new 国家统计局 out


爬一下最新的行政区划

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html

引入依赖(下面的代码还用到了 fastjson 和 slf4j,清单中没有列出,需要自行添加)

<!-- https://mvnrepository.com/artifact/com.belerweb/pinyin4j -->
<dependency>
    <groupId>com.belerweb</groupId>
    <artifactId>pinyin4j</artifactId>
    <version>2.5.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>4.4.1</version>
</dependency>

代码

package com.demo.tools;

import com.alibaba.fastjson.JSON;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import okhttp3.ConnectionPool;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

/**
 * Crawls the 2019 urban-rural division code pages published by the National
 * Bureau of Statistics of China and writes the resulting
 * province / city / county tree to a JSON file.
 *
 * Created by 小LUA on 2020-03-30 11:39.
 */
@Slf4j
public class GetProvince {

    /** Root of the 2019 code pages; hrefs on the index and province pages are relative to it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";

    /**
     * City pages whose rows are read from {@code towntr} elements instead of
     * {@code countytr} elements.
     * NOTE(review): codes 4419 / 4420 / 4604 appear to be Dongguan, Zhongshan and
     * Danzhou, which list towns directly under the city - confirm.
     */
    private static final Set<String> TOWN_LEVEL_CITY_URLS = new HashSet<>(Arrays.asList(
            BASE_URL + "44/4419.html",
            BASE_URL + "44/4420.html",
            BASE_URL + "46/4604.html"));

    /** Matches link texts that consist of digits only (the code column of a row). */
    private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]+");

    /**
     * Charset used to decode the pages. GBK is a superset of GB2312, so rare
     * characters in place names that GB2312 cannot represent are decoded correctly
     * instead of turning into replacement characters.
     */
    private static final Charset PAGE_CHARSET = Charset.forName("GBK");

    /** Name of the JSON file produced by {@link #write(String)}. */
    private static final String OUTPUT_FILE = "2019省市区-大陆.json";

    /** How many times a page is requested before the crawl gives up on it. */
    private static final int MAX_FETCH_ATTEMPTS = 10;

    /** Base delay between attempts; multiplied by the attempt number (linear back-off). */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    // Generous timeouts; the pool keeps no idle connections, so every request
    // opens a fresh connection.
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient.Builder()
            .connectTimeout(5, TimeUnit.MINUTES)
            .writeTimeout(5, TimeUnit.MINUTES)
            .readTimeout(5, TimeUnit.MINUTES)
            .connectionPool(new ConnectionPool(0, 30, TimeUnit.MINUTES))
            .build();

    /**
     * Returns the upper-case first pinyin letter of the first character of a name.
     *
     * @param chinese the name to inspect; may be {@code null} or empty
     * @return a one-letter string, or {@code ""} when the input is {@code null} or
     *         empty, or when no pinyin reading is available for the first character
     */
    private static String getFirstSpell(String chinese) {
        if (chinese == null || chinese.isEmpty()) {
            return "";
        }
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.UPPERCASE);
        try {
            // pinyin4j yields null for characters it has no reading for.
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(chinese.charAt(0), format);
            if (readings == null || readings.length == 0 || readings[0].isEmpty()) {
                return "";
            }
            return String.valueOf(readings[0].charAt(0));
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            log.warn("Cannot derive the pinyin initial of '{}'", chinese, e);
            return "";
        }
    }

    /**
     * Downloads a page and decodes it as GBK text.
     *
     * @param url absolute URL of the page
     * @return the decoded response body
     * @throws IOException if the request fails or the server answers with a
     *                     non-2xx status
     */
    public static String readUrl(String url) throws IOException {
        System.out.println("读取URL:" + url);
        Request request = new Request.Builder()
                .url(url)
                .build();
        // try-with-resources releases the response on every path, including the error path.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
            }
            return new String(response.body().bytes(), PAGE_CHARSET);
        }
    }

    /**
     * Crawls all provinces with their cities and counties, prints the resulting
     * JSON and stores it via {@link #write(String)}.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or parsed, or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        List<Location> all = crawlProvinces();
        String jsonString = JSON.toJSONString(all);
        System.out.println(jsonString);
        write(jsonString);
    }

    /**
     * Writes the given text to the output file, encoded as UTF-8.
     *
     * @param str the text to store
     * @throws IOException if the file cannot be created or written
     */
    public static void write(String str) throws IOException {
        try (FileOutputStream out = new FileOutputStream(OUTPUT_FILE)) {
            out.write(str.getBytes(StandardCharsets.UTF_8));
        }
    }

    /** Reads the index page and builds one level-1 node per province link, children included. */
    private static List<Location> crawlProvinces() throws IOException, InterruptedException {
        List<Location> provinces = new ArrayList<>();
        for (Element row : fetchRows(BASE_URL + "index.html", "provincetr", "provinceList")) {
            for (Element link : row.select("a")) {
                String name = link.text();
                String href = link.attr("href");
                String code = codeFromHref(href);
                String provinceUrl = BASE_URL + href;
                System.out.println(name + "," + code + "," + provinceUrl);

                // The page name holds the 2-digit prefix; pad it to a 6-digit code.
                Long provinceCode = Long.valueOf(code + "0000");
                Location province = newLocation(provinceCode, name, 1, null);
                provinces.add(province);
                province.setChilds(crawlCities(provinceUrl, provinceCode));
            }
        }
        return provinces;
    }

    /** Reads a province page and builds one level-2 node per city link, children included. */
    private static List<Location> crawlCities(String provinceUrl, Long provinceCode)
            throws IOException, InterruptedException {
        List<Location> cities = new ArrayList<>();
        for (Element row : fetchRows(provinceUrl, "citytr", "cityList")) {
            for (Element link : row.select("a")) {
                String name = link.text();
                // Each row links both the numeric code and the name; keep the name only.
                if (DIGITS_ONLY.matcher(name).matches()) {
                    continue;
                }
                String href = link.attr("href");
                String code = codeFromHref(href);
                String cityUrl = BASE_URL + href;
                System.out.println(name + "," + code + "," + cityUrl);

                // The page name holds the 4-digit prefix; pad it to a 6-digit code.
                Long cityCode = Long.valueOf(code + "00");
                Location city = newLocation(cityCode, name, 2, provinceCode);
                cities.add(city);
                city.setChilds(crawlCounties(cityUrl, cityCode));
            }
        }
        return cities;
    }

    /** Reads a city page and builds one level-3 node per county (or town) link. */
    private static List<Location> crawlCounties(String cityUrl, Long cityCode)
            throws IOException, InterruptedException {
        String rowClass = TOWN_LEVEL_CITY_URLS.contains(cityUrl) ? "towntr" : "countytr";
        List<Location> counties = new ArrayList<>();
        for (Element row : fetchRows(cityUrl, rowClass, "countyList")) {
            for (Element link : row.select("a")) {
                String name = link.text();
                // Each row links both the numeric code and the name; keep the name only.
                if (DIGITS_ONLY.matcher(name).matches()) {
                    continue;
                }
                String code = codeFromHref(link.attr("href"));
                System.out.println(name + "," + code);
                counties.add(newLocation(Long.valueOf(code), name, 3, cityCode));
            }
        }
        return counties;
    }

    /**
     * Fetches a page until it contains at least one element of the given CSS class.
     * Failed requests and pages without matching rows are retried with a linearly
     * growing pause; after {@link #MAX_FETCH_ATTEMPTS} attempts the crawl fails
     * instead of requesting the page forever.
     *
     * @param url      page to fetch
     * @param rowClass CSS class of the table rows to extract
     * @param label    name used in the progress output
     * @return the non-empty set of matching rows
     * @throws IOException          if no rows could be obtained within the attempt limit
     * @throws InterruptedException if the thread is interrupted while waiting to retry
     */
    private static Elements fetchRows(String url, String rowClass, String label)
            throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            try {
                Elements rows = Jsoup.parse(readUrl(url)).getElementsByClass(rowClass);
                System.out.println(label + "是否为空:" + rows.isEmpty());
                if (!rows.isEmpty()) {
                    return rows;
                }
                lastFailure = null;
            } catch (IOException e) {
                lastFailure = e;
                log.warn("Attempt {}/{} for {} failed: {}", attempt, MAX_FETCH_ATTEMPTS, url, e.toString());
            }
            if (attempt < MAX_FETCH_ATTEMPTS) {
                Thread.sleep(RETRY_DELAY_MILLIS * attempt);
            }
        }
        throw new IOException("No '" + rowClass + "' rows found at " + url
                + " after " + MAX_FETCH_ATTEMPTS + " attempts", lastFailure);
    }

    /**
     * Extracts the code from a link target, i.e. the file name without its extension
     * ({@code "11/1101.html"} yields {@code "1101"}).
     *
     * @throws IllegalArgumentException if the href has no file name followed by a dot
     */
    private static String codeFromHref(String href) {
        int start = href.lastIndexOf('/') + 1;
        int end = href.lastIndexOf('.');
        if (end <= start) {
            throw new IllegalArgumentException("Cannot extract a division code from href '" + href + "'");
        }
        return href.substring(start, end);
    }

    /** Creates a node with the given values and the pinyin initial of its name. */
    private static Location newLocation(Long code, String name, int level, Long parentCode) {
        Location location = new Location();
        location.setCode(code);
        location.setName(name);
        location.setLevel(level);
        location.setLetterSort(getFirstSpell(name));
        location.setParentCode(parentCode);
        return location;
    }

}

/**
 * One node of the crawled administrative-division tree (province, city or county).
 * The list of top-level nodes is what gets serialized to the JSON file; getters and
 * setters are generated by Lombok.
 */
@Getter
@Setter
class Location{
    /** Numeric division code; the crawler appends "0000" to province codes and "00" to city codes. */
    private Long code;
    /** Display name taken from the link text of the page. */
    private String name;
    /** Depth in the tree as set by the crawler: 1 = province, 2 = city, 3 = county. */
    private Integer level;
    /** Upper-case first pinyin letter of the name, or an empty string when it could not be determined. */
    private String letterSort;
    /** Code of the parent node; the crawler leaves it unset for provinces. */
    private Long parentCode;
    /** Child nodes (cities of a province, counties of a city); the crawler leaves it unset for counties. */
    private List<Location> childs;
}

爬完的数据我只存到了 JSON 文件里;如果需要存到数据库,只需对 all 进行处理即可,也可以先读取文件再处理:

/**
 * Loads the crawled JSON file and iterates over its top-level entries (the provinces).
 * The file is decoded as UTF-8, the encoding RFC 8259 requires for exchanged JSON;
 * a file produced with another charset has to be re-encoded first.
 *
 * @throws IOException if the file cannot be opened or read
 */
private static void read() throws IOException {
    StringBuilder json = new StringBuilder();
    // try-with-resources closes the reader and the underlying file stream on every path.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            new FileInputStream("2019省市区-大陆.json"), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            json.append(line);
        }
    }
    List<Location> provinces = JSONArray.parseArray(json.toString(), Location.class);
    if (provinces == null) {
        // Defensive: nothing to process if the parser yields no list (e.g. for an empty file).
        return;
    }
    provinces.forEach(province -> {
        // TODO
    });
}

另附:全部数据(有很小一部分名称爬下来是乱码,导致首字母识别不出来,需要手动改正;可搜索【"letterSort": ""】定位。乱码很可能是因为页面中含有 GB2312 字符集之外的生僻字,改用 GBK 解码通常可以避免)

 

一共有6个,不算多。

数据文件:https://github.com/Mysakura/DataFiles

 

标签:String,城乡,System,2019,import,println,new,国家统计局,out
来源: https://www.cnblogs.com/LUA123/p/12603550.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有