talkCrawler/src/talkcrawler/DailyIndex.java at cba2633c4744e15e0ff53c46be6684614f0a53ff

Fork: 0
haya4 / talkCrawler
Find file
Newer
Older
talkCrawler / src / talkcrawler / DailyIndex.java
haya4 on 7 Jul 2019 5 KB 前日の月の記事をクロールする
Raw Blame History
package talkcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import tool.http.Post;

/**
 * Crawls one index page of the archive and posts every article listed on it.
 *
 * <p>The index page is fetched from {@code <MONTHLY_INDEX>/<dir>/<file>}. The page is
 * first read as UTF-8; if its header declares {@code CHARSET=EUC-JP} it is fetched a
 * second time and decoded as EUC-JP. Each list entry found is loaded as an
 * {@code Article} and sent through {@code Post}.
 */
public class DailyIndex {
    private static final Logger LOGGER = Logger.getLogger(DailyIndex.class.getName());

    /** Length of the {@code <LI>} tag that starts a list entry. */
    private static final int LI_TAG_LENGTH = 4;

    /** Length of {@code <A HREF="}: the eight characters matched plus the opening quote. */
    private static final int HREF_PREFIX_LENGTH = 9;

    /**
     * Number of trailing characters dropped from the anchor token to obtain the id.
     * NOTE(review): presumably the length of {@code .html"} - confirm against the index pages.
     */
    private static final int HREF_SUFFIX_LENGTH = 6;

    String path;
    String dir;
    String file;
    TalkCrawlerProperties prop;

    /**
     * Command-line entry point: crawls {@code date.html} in the directory named after
     * the month (formatted as {@code yyyy-MMMM}, English month name, GMT) that
     * yesterday belongs to.
     *
     * @param args unused
     * @throws IOException declared by the {@link DailyIndex} constructor
     */
    public static void main(String[] args) throws IOException {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(new Date());
        calendar.add(Calendar.DAY_OF_MONTH, -1);

        // Four or more pattern letters already select the full month name, so
        // "MMMM" formats exactly like the longer "MMMMMMMM" did.
        DateFormat df = new SimpleDateFormat("yyyy-MMMM", Locale.UK);
        df.setTimeZone(TimeZone.getTimeZone("GMT"));
        String monthly = df.format(calendar.getTime());

        TalkCrawlerProperties prop = new TalkCrawlerProperties().load();
        DailyIndex ins = new DailyIndex(prop, monthly, "date.html");
        ins.load();
    }

    /**
     * Creates a crawler for one index page.
     *
     * @param prop crawler settings; its {@code MONTHLY_INDEX} property is used as the base URL
     * @param dir  directory below the base URL (the month, when started from {@link #main})
     * @param file file name of the index page inside {@code dir}
     * @throws java.io.IOException never thrown by this implementation; kept for callers
     */
    public DailyIndex(TalkCrawlerProperties prop, String dir, String file) throws IOException {
        this.prop = prop;
        this.path = prop.getProperty("MONTHLY_INDEX");
        this.dir = dir;
        this.file = file;
    }

    /**
     * Fetches the index page and processes its entries.
     *
     * <p>Any failure is logged at {@code SEVERE} level and ends the crawl; nothing is
     * propagated to the caller.
     */
    @SuppressWarnings("UseSpecificCatch")
    public void load() {
        boolean euc;
        try {
            // First pass: decode as UTF-8 and look for an EUC-JP declaration in the header.
            euc = fetch("UTF-8", true);
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return;
        }
        if (!euc) {
            return;
        }

        try {
            // Second pass: the page declared EUC-JP, so fetch it again with that decoder.
            fetch("EUC-JP", false);
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Downloads the index page with the given decoder and processes its entries.
     *
     * @param charset      name of the charset used to decode the response
     * @param sniffCharset when {@code true}, the header is scanned first and the
     *                     entries are only processed if no EUC-JP declaration is found
     * @return {@code true} if {@code sniffCharset} was set and the page declares EUC-JP
     *         (in which case no entry has been processed), {@code false} otherwise
     * @throws IOException if the page cannot be fetched or read
     */
    private boolean fetch(String charset, boolean sniffCharset) throws IOException {
        URL url = new URL(String.format("%s/%s/%s", path, dir, file));
        HttpURLConnection http = (HttpURLConnection) url.openConnection();
        try {
            http.setRequestMethod("GET");
            http.connect();
            try (BufferedReader rd = new BufferedReader(
                    new InputStreamReader(http.getInputStream(), charset))) {
                if (sniffCharset && declaresEucJp(rd)) {
                    return true;
                }
                getLi(rd);
                return false;
            }
        }
        finally {
            // Release the connection even when getInputStream() or the parsing failed.
            http.disconnect();
        }
    }

    /**
     * Reads header lines until an EUC-JP declaration, the {@code <BODY} tag or the end
     * of the input is reached. The reader is left positioned after the last line read.
     *
     * @param rd reader positioned at the start of the page
     * @return {@code true} if a {@code <META } line containing {@code CHARSET=EUC-JP} was found
     * @throws IOException if reading fails
     */
    private static boolean declaresEucJp(BufferedReader rd) throws IOException {
        String line;
        while ((line = rd.readLine()) != null) {
            String str = line.trim().toUpperCase(Locale.ROOT);
            if (str.startsWith("<META ") && str.contains("CHARSET=EUC-JP")) {
                return true;
            }
            if (str.startsWith("<BODY")) {
                return false;
            }
        }
        return false;
    }

    /**
     * Scans the list part of the index page and posts each article found.
     *
     * <p>A line starting with {@code <LI>} supplies the article id; a line that is
     * exactly {@code </I>} closes the entry and triggers loading and posting of the
     * article. Scanning stops at the first {@code </UL>} line seen after the
     * {@code 記事数:} marker line, or at the end of the input.
     *
     * @param reader reader positioned inside the index page
     * @throws IOException if reading the page or handling an article fails
     */
    void getLi(BufferedReader reader) throws IOException {
        String line;
        String id = "";
        boolean datain = false;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            // Locale.ROOT keeps the tag comparison independent of the default locale
            // (a Turkish locale would otherwise upper-case "i" to a dotted capital).
            String upper = str.toUpperCase(Locale.ROOT);
            // The marker has to be written in upper case because it is compared with
            // the upper-cased line; a lower-case "<b>" can never match.
            if (upper.startsWith("<B>記事数:</B>")) {
                datain = true;
            }
            if (upper.startsWith("<LI>")) {
                id = getId(str.substring(LI_TAG_LENGTH));
            }
            if (str.equals("</I>")) {
                if (id.isEmpty()) {
                    // Without an id there is no article to load.
                    LOGGER.log(Level.WARNING, "Skipping index entry without article id");
                }
                else {
                    System.out.println("-----");
                    Article article = new Article(path, dir, id);
                    article.load();
                    new Post(prop).post(Article.toJsonArray(article.toJsonObject()));
                }
            }
            if (datain && upper.startsWith("</UL>")) {
                break;
            }
        }
    }

    /**
     * Returns the second {@code '>'}-separated token of a list entry, trimmed.
     *
     * <p>For an entry of the form {@code <A HREF="x.html">Title</A>} this is
     * {@code Title</A}: the text is cut at the next {@code '>'}, so the start of the
     * closing tag is still part of the result.
     *
     * @param str list entry without its leading {@code <LI>}
     * @return the second token, or an empty string if there is none
     * @throws IOException never thrown by this implementation; kept for callers
     */
    String getTitle(String str) throws IOException {
        StringTokenizer st = new StringTokenizer(str, ">");
        if (st.hasMoreTokens()) {
            st.nextToken();     // skip the first token (the anchor tag)
        }
        if (st.hasMoreTokens()) {
            return st.nextToken().trim();
        }
        return "";
    }

    /**
     * Extracts the article id from the anchor tag that starts a list entry.
     *
     * <p>The first {@code '>'}-separated token must start with {@code <A HREF=}
     * (case-insensitive). The id is that token without its first nine and last six
     * characters.
     *
     * @param str list entry without its leading {@code <LI>}
     * @return the id, or an empty string if the entry does not start with a link or
     *         the link is too short to contain an id
     * @throws IOException never thrown by this implementation; kept for callers
     */
    String getId(String str) throws IOException {
        StringTokenizer st = new StringTokenizer(str, ">");
        if (!st.hasMoreTokens()) {
            return "";
        }
        String tag = st.nextToken().trim();
        if (!tag.toUpperCase(Locale.ROOT).startsWith("<A HREF=")) {
            return "";
        }
        int end = tag.length() - HREF_SUFFIX_LENGTH;
        if (end < HREF_PREFIX_LENGTH) {
            // Token too short to hold prefix and suffix; substring would throw.
            return "";
        }
        return tag.substring(HREF_PREFIX_LENGTH, end);
    }
}