// talkCrawler / src / talkcrawler / MonthlyIndex.java
package talkcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.json.Json;
import javax.json.JsonArrayBuilder;

/**
 * Downloads the page named by the {@code MONTHLY_INDEX} property and, for
 * every link on it whose label is {@code [ 日付 ]} (Japanese for "date"),
 * creates a {@link DailyIndex} for the linked path and loads it.
 *
 * <p>The page is scanned line by line with plain prefix/suffix matching; it is
 * not parsed as HTML. A tag is only recognised when it starts (opening tags)
 * or ends (closing tags) a trimmed line.
 */
public class MonthlyIndex {
    /** Start of an anchor line, matched case-insensitively. */
    private static final String ANCHOR_PREFIX = "<A HREF=\"";

    /** End of an anchor line carrying the "date" label, matched case-insensitively. */
    private static final String ANCHOR_SUFFIX = "\">[ 日付 ]</A>";

    // Example value for the MONTHLY_INDEX property:
    //   https://lists.openstreetmap.org/pipermail/talk-ja
    TalkCrawlerProperties prop;
    JsonArrayBuilder arryBuild;
    URL monthlyIndexUrl;

    /**
     * Creates a crawler for the index page configured in {@code prop}.
     *
     * @param prop crawler settings; its {@code MONTHLY_INDEX} property must hold
     *             the URL of the index page. The same object is passed on to
     *             every {@link DailyIndex} created by {@link #getA}.
     * @throws java.io.IOException if the {@code MONTHLY_INDEX} value is not a
     *             valid URL (reported as {@code MalformedURLException})
     */
    public MonthlyIndex(TalkCrawlerProperties prop) throws IOException {
        this.prop = prop;
        this.arryBuild = Json.createArrayBuilder();
        this.monthlyIndexUrl = new URL(prop.getProperty("MONTHLY_INDEX"));
    }

    /**
     * Fetches the index page over HTTP and scans it with {@link #getHtml}.
     *
     * <p>This is a best-effort operation: any exception raised while connecting,
     * reading or processing the page is logged at {@code SEVERE} level and not
     * rethrown.
     */
    @SuppressWarnings("UseSpecificCatch")
    public void load() {
        HttpURLConnection http = null;
        try {
            http = (HttpURLConnection) monthlyIndexUrl.openConnection();
            http.setRequestMethod("GET");
            http.connect();

            // Decode with the charset the server declares; the link label being
            // searched for is non-ASCII, so a wrong charset finds nothing.
            Charset charset = responseCharset(http.getContentType());
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(http.getInputStream(), charset))) {
                getHtml(reader);
            }
        }
        catch (Exception ex) {
            Logger.getLogger(MonthlyIndex.class.getName()).log(Level.SEVERE,
                    "Failed to crawl monthly index " + monthlyIndexUrl, ex);
        }
        finally {
            if (http != null) {
                http.disconnect();
            }
        }
    }

    /**
     * Picks the charset used to decode the response body.
     *
     * @param contentType value of the {@code Content-Type} header, may be {@code null}
     * @return the charset named by the header's {@code charset} parameter, or the
     *         platform default charset when the header is absent, has no such
     *         parameter, or names a charset this JVM does not know
     */
    private static Charset responseCharset(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (!startsWithIgnoreCase(param, key)) {
                    continue;
                }
                String name = param.substring(key.length()).trim();
                // The parameter value may be a quoted string.
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return Charset.forName(name);
                }
                catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: use the default below.
                    break;
                }
            }
        }
        return Charset.defaultCharset();
    }

    /** Case-insensitive, locale-independent equivalent of {@code text.startsWith(prefix)}. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Case-insensitive, locale-independent equivalent of {@code text.endsWith(suffix)}. */
    private static boolean endsWithIgnoreCase(String text, String suffix) {
        // regionMatches returns false for a negative offset, i.e. when text is too short.
        return text.regionMatches(true, text.length() - suffix.length(), suffix, 0, suffix.length());
    }

    /**
     * Reads lines until one starts with {@code <HTML>}, then hands the reader to
     * {@link #getBody}.
     *
     * <p>{@code getBody} reads to the end of the input, so once it returns there
     * is normally nothing left for this loop to read.
     *
     * @param reader source of the page text; consumed by this call
     * @throws IOException if reading fails or a nested step throws it
     */
    void getHtml(BufferedReader reader) throws IOException {
        boolean in = false;
        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            if (startsWithIgnoreCase(str, "<HTML>")) {
                in = true;
            }
            if (in) {
                getBody(reader);
            }
            if (endsWithIgnoreCase(str, "</HTML>")) {
                in = false;
            }
        }
    }

    /**
     * Reads lines until one starts with {@code <BODY}, then hands the reader to
     * {@link #getTable}, which reads to the end of the input.
     *
     * @param reader source of the page text; consumed by this call
     * @throws IOException if reading fails or a nested step throws it
     */
    void getBody(BufferedReader reader) throws IOException {
        boolean in = false;
        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            if (startsWithIgnoreCase(str, "<BODY")) {
                in = true;
            }
            if (in) {
                getTable(reader);
            }
            if (endsWithIgnoreCase(str, "</BODY>")) {
                in = false;
            }
        }
    }

    /**
     * Reads lines until one starts with {@code <TABLE}, then hands the reader to
     * {@link #getTr}, which reads to the end of the input.
     *
     * @param reader source of the page text; consumed by this call
     * @throws IOException if reading fails or a nested step throws it
     */
    void getTable(BufferedReader reader) throws IOException {
        boolean in = false;
        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            if (startsWithIgnoreCase(str, "<TABLE")) {
                in = true;
            }
            if (in) {
                getTr(reader);
            }
            if (endsWithIgnoreCase(str, "</TABLE>")) {
                in = false;
            }
        }
    }

    /**
     * Reads lines until one starts with {@code <TR}, then hands the reader to
     * {@link #getA}, which reads to the end of the input.
     *
     * @param reader source of the page text; consumed by this call
     * @throws IOException if reading fails or a nested step throws it
     */
    void getTr(BufferedReader reader) throws IOException {
        boolean in = false;
        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            if (startsWithIgnoreCase(str, "<TR")) {
                in = true;
            }
            if (in) {
                getA(reader);
            }
            if (endsWithIgnoreCase(str, "</TR>")) {
                in = false;
            }
        }
    }

    /**
     * Reads every remaining line and, for each one that is a complete
     * {@code <A HREF="dir/name">[ 日付 ]</A>} anchor, prints {@code ./dir/name}
     * to standard output and loads a {@link DailyIndex} for it.
     *
     * <p>{@code dir} and {@code name} are the first two {@code /}-separated
     * segments of the link target; a missing segment is passed on as an empty
     * string.
     *
     * @param reader source of the page text; read to the end by this call
     * @throws IOException if reading fails or the {@code DailyIndex} step throws it
     */
    void getA(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            if (!isDateAnchor(str)) {
                continue;
            }

            String path = str.substring(ANCHOR_PREFIX.length(), str.length() - ANCHOR_SUFFIX.length());
            String dir = "";
            String name = "";
            StringTokenizer st = new StringTokenizer(path, "/");
            if (st.hasMoreTokens()) {
                dir = st.nextToken().trim();
            }
            if (st.hasMoreTokens()) {
                name = st.nextToken().trim();
            }

            System.out.println(String.format("./%s/%s", dir, name));
            DailyIndex daily = new DailyIndex(prop, dir, name);
            daily.load();
        }
        // Posting of the collected JSON array is currently disabled:
        //new Post().post(arryBuild.build());
    }

    /**
     * Tells whether a trimmed line is a "date" anchor whose target can be cut out
     * safely.
     *
     * <p>The length check rejects lines such as {@code <A HREF=">[ 日付 ]</A>},
     * where prefix and suffix share the quote character; without it the
     * substring bounds in {@link #getA} would cross and throw.
     */
    private static boolean isDateAnchor(String str) {
        return str.length() >= ANCHOR_PREFIX.length() + ANCHOR_SUFFIX.length()
                && startsWithIgnoreCase(str, ANCHOR_PREFIX)
                && endsWithIgnoreCase(str, ANCHOR_SUFFIX);
    }
}