talkCrawler/src/talkcrawler/DailyIndex.java at 981db26eacf85cdb3b5533bedadae73ef3d74eaf

Fork: 0
haya4 / talkCrawler
Find file
Newer
Older
talkCrawler / src / talkcrawler / DailyIndex.java
yuu on 22 Jan 2019 3 KB talk-jaのサイトからTEXTを抽出するところまで出来た
Raw Blame History
package talkcrawler;
 
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
 
public class DailyIndex {
    String path;
    String dir;
    String file;
    
    /**
     * 
     * @param path
     * @param dir
     * @param file
     * @throws java.io.IOException 
     */
    public DailyIndex(String path, String dir, String file) throws IOException {
        this.path = path;
        this.dir = dir;
        this.file = file;
    }
 
    @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"})
    public void load() {
        boolean euc = false;
        try {
            URL url = new URL(String.format("%s/%s/%s", path, dir, file));
            HttpURLConnection http = (HttpURLConnection)url.openConnection();
            http.setRequestMethod("GET");
            http.connect();
            try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) {
                String line;
                while((line = rd.readLine()) != null) {
                    String str = line.trim().toUpperCase();
                    if (str.startsWith("<META ")) {
                        if (str.contains("CHARSET=EUC-JP")) {
                            euc = true;
                            break;
                        }
                    }
                    if (str.startsWith("<BODY")) {
                        break;
                    }
                }
                if (!euc) {
                    getLi(rd);
                }
            }
            
        }
        catch (Exception ex) {
            Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
            return;
        }
 
        try {
            if (euc) {
                URL url = new URL(String.format("%s/%s/%s", path, dir, file));
                HttpURLConnection http = (HttpURLConnection)url.openConnection();
                http.setRequestMethod("GET");
                http.connect();
                try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "EUC-JP"))) {
                    getLi(rd);
                }
            }
        }
        catch (Exception ex) {
            Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
            return;
        }
    }
    
    void getLi(BufferedReader reader) throws IOException {
        String line;
        String title = "";
        String id = "";
        String name = "";
        while((line = reader.readLine()) != null) {
            String str = line.trim();
            if (str.toUpperCase().startsWith("<LI>")) {
                String str1 = str.substring(4);     // 4 <-- length('<LI>')
                title = getTitle(str1);
                id = getId(str1);
            }
            if (str.toUpperCase().startsWith("<I>")) {
                name = str.substring(3);
            }
            if (str.equals("</I>")) {
                System.out.println("-----");
                (new Article(MonthlyIndex.MONTHLY_INDEX, dir, id)).load();
            }
        }
    }
    
    String getTitle(String str) throws IOException {
        String title = "";
        StringTokenizer st = new StringTokenizer(str, ">");
        if (st.hasMoreTokens()) {
            String no = st.nextToken().trim();
        }
        if (st.hasMoreTokens()) {
            title = st.nextToken().trim();
        }
        return title;
    }
    
    String getId(String str) throws IOException {
        String file = "";
        StringTokenizer st = new StringTokenizer(str, ">");
        if (st.hasMoreTokens()) {
            String no = st.nextToken().trim();
            if (no.toUpperCase().startsWith("<A HREF=")) {
                file = no.substring(9, no.length() - 6);
            }
        }
        return file;
    }
}