// Newer
// Older
// talkCrawler / src / talkcrawler / MonthlyIndex.java
// @haya4 haya4 on 10 Mar 2019 4 KB ok
package talkcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;

/**
 * Downloads the top-level archive page at {@link #MONTHLY_INDEX}, scans it line by line
 * for the "[ 日付 ]" ("date") links and hands every link found to a {@code DailyIndex}.
 *
 * <p>The scan is line oriented: a tag is only recognised when it starts (or, for the
 * link, starts and ends) a trimmed line of the page.
 */
public class MonthlyIndex {
    /** URL of the archive page that is downloaded; link paths are reported relative to it. */
    final static String MONTHLY_INDEX = "https://lists.openstreetmap.org/pipermail/talk-ja";

    /** Opening part of a link line, up to and including the quote that starts the href value. */
    private static final String DATE_LINK_PREFIX = "<A HREF=\"";

    /** Closing part of a link line: end of the href value followed by the "[ 日付 ]" label. */
    private static final String DATE_LINK_SUFFIX = "\">[ 日付 ]</A>";

    /** Builder whose built array is passed to {@code Article.post} when the scan finishes. */
    JsonArrayBuilder arryBuild;
    
    /**
     * Creates an instance with a fresh, empty JSON array builder.
     *
     * @throws java.io.IOException declared for callers; nothing in this constructor raises it
     */
    public MonthlyIndex() throws IOException {
        this.arryBuild = Json.createArrayBuilder();
    }

    /**
     * Fetches {@link #MONTHLY_INDEX} with an HTTP GET and scans the response body.
     *
     * <p>The body is decoded with the charset announced in the response's
     * {@code Content-Type} header; the platform default is used only when the header
     * carries no usable charset. Any failure is logged at {@code SEVERE} level and not
     * rethrown. The connection is always disconnected afterwards.
     */
    @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"})
    public void load() {
        HttpURLConnection http = null;
        try {
            URL url = new URL(MONTHLY_INDEX);
            http = (HttpURLConnection)url.openConnection();
            http.setRequestMethod("GET");
            http.connect();
            
            // The link label is Japanese text, so decoding with the wrong charset
            // would make the match in getA() fail silently.
            Charset charset = charsetOf(http.getContentType());
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(http.getInputStream(), charset))) {
                getHtml(reader);
            }
        }
        catch (Exception ex) {
            Logger.getLogger(MonthlyIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
        finally {
            if (http != null) {
                http.disconnect();
            }
        }
    }
    
    /**
     * Skips lines until one starts with {@code <HTML>} (case-insensitive), then passes the
     * rest of the input to {@link #getBody(BufferedReader)}.
     *
     * @param reader source of the page text; read until exhausted
     * @throws IOException if reading fails
     */
    void getHtml(BufferedReader reader) throws IOException {
        String line;
        while((line = reader.readLine()) != null) {
            if (startsWithIgnoreCase(line.trim(), "<HTML>")) {
                // The nested scan reads until end of input, so nothing is left afterwards.
                getBody(reader);
                return;
            }
        }
    }
    
    /**
     * Skips lines until one starts with {@code <BODY} (case-insensitive), then passes the
     * rest of the input to {@link #getTable(BufferedReader)}.
     *
     * @param reader source of the page text; read until exhausted
     * @throws IOException if reading fails
     */
    void getBody(BufferedReader reader) throws IOException {
        String line;
        while((line = reader.readLine()) != null) {
            if (startsWithIgnoreCase(line.trim(), "<BODY")) {
                getTable(reader);
                return;
            }
        }
    }
    
    /**
     * Skips lines until one starts with {@code <TABLE} (case-insensitive), then passes the
     * rest of the input to {@link #getTr(BufferedReader)}.
     *
     * @param reader source of the page text; read until exhausted
     * @throws IOException if reading fails
     */
    void getTable(BufferedReader reader) throws IOException {
        String line;
        while((line = reader.readLine()) != null) {
            if (startsWithIgnoreCase(line.trim(), "<TABLE")) {
                getTr(reader);
                return;
            }
        }
    }
    
    /**
     * Skips lines until one starts with {@code <TR} (case-insensitive), then passes the
     * rest of the input to {@link #getA(BufferedReader)}.
     *
     * @param reader source of the page text; read until exhausted
     * @throws IOException if reading fails
     */
    void getTr(BufferedReader reader) throws IOException {
        String line;
        while((line = reader.readLine()) != null) {
            if (startsWithIgnoreCase(line.trim(), "<TR")) {
                getA(reader);
                return;
            }
        }
    }
    
    /**
     * Reads every remaining line and, for each trimmed line of the form
     * {@code <A HREF="dir/name">[ 日付 ]</A>} (tag case-insensitive), prints the resulting
     * URL and loads a {@code DailyIndex} for it. After the input is exhausted the array
     * built from {@link #arryBuild} is passed to {@code Article.post}.
     *
     * <p>The href value is split on {@code '/'}; the first token is used as {@code dir}
     * and the second as {@code name}. A missing token is replaced by an empty string.
     *
     * @param reader source of the page text; read until exhausted
     * @throws IOException if reading fails
     */
    void getA(BufferedReader reader) throws IOException {
        // Prefix and suffix both contain a quote; requiring room for both prevents a line
        // where they overlap from producing a negative-length substring.
        final int minLength = DATE_LINK_PREFIX.length() + DATE_LINK_SUFFIX.length();
        String line;
        while((line = reader.readLine()) != null) {
            String str = line.trim();
            if (str.length() >= minLength
                    && startsWithIgnoreCase(str, DATE_LINK_PREFIX)
                    && endsWithIgnoreCase(str, DATE_LINK_SUFFIX)) {
                String path = str.substring(DATE_LINK_PREFIX.length(), str.length() - DATE_LINK_SUFFIX.length());
                String dir = "";
                String name = "";
                StringTokenizer st = new StringTokenizer(path, "/");
                if (st.hasMoreTokens()) {
                    dir = st.nextToken().trim();
                }
                if (st.hasMoreTokens()) {
                    name = st.nextToken().trim();
                }

                System.out.println(String.format("%s/%s/%s", MONTHLY_INDEX, dir, name));
                DailyIndex daily = new DailyIndex(MONTHLY_INDEX, dir, name);
                daily.load();
            }
        }
        Article.post(arryBuild.build());
    }

    /**
     * Extracts the charset named by a {@code Content-Type} header value.
     *
     * @param contentType header value such as {@code text/html; charset=UTF-8}; may be null
     * @return the named charset, or the platform default when the value is null, has no
     *         {@code charset} parameter, or names a charset this JVM does not support
     */
    private static Charset charsetOf(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (!startsWithIgnoreCase(param, key)) {
                    continue;
                }
                String label = param.substring(key.length()).trim();
                if (label.length() >= 2 && label.startsWith("\"") && label.endsWith("\"")) {
                    label = label.substring(1, label.length() - 1);
                }
                try {
                    return Charset.forName(label);
                }
                catch (IllegalArgumentException ex) {
                    // Malformed or unsupported charset name: use the default below.
                    break;
                }
            }
        }
        return Charset.defaultCharset();
    }

    /** Tells whether {@code text} begins with {@code prefix}, ignoring letter case. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Tells whether {@code text} ends with {@code suffix}, ignoring letter case. */
    private static boolean endsWithIgnoreCase(String text, String suffix) {
        return text.regionMatches(true, text.length() - suffix.length(), suffix, 0, suffix.length());
    }
}