// talkCrawler / src / talkcrawler / Article.java
package talkcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObject;
import javax.json.JsonObjectBuilder;

/**
 * One archived "[OSM-ja]" message page, addressed as {@code path/dir/id.html}.
 *
 * <p>{@link #load()} downloads the page and fills {@link #title}, {@link #date},
 * {@link #name} and {@link #contents}; {@link #toJsonObject()} exports whatever
 * has been filled in so far.
 */
public class Article {
    /** Logger used to report pages that could not be downloaded or parsed. */
    private static final Logger LOGGER = Logger.getLogger(Article.class.getName());

    /** Opening of the heading line that carries the message title. */
    private static final String TITLE_PREFIX = "<H1>[OSM-ja] ";
    /** Closing tag expected at the end of the heading line. */
    private static final String TITLE_SUFFIX = "</H1>";

    /** Date-line layouts accepted by {@link #getLi(BufferedReader)}, tried in this order. */
    private static final String[] DATE_PATTERNS = {
        "'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' UTC</I>'",
        "'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' GMT</I>'"
    };

    /** Full page address, {@code path/dir/id.html}, as built by the constructor. */
    String url;
    /** Leading part of the page address (everything before {@code dir}). */
    String path;
    /** Middle segment of the page address. */
    String dir;
    /** File name of the page without the {@code .html} extension. */
    String id;
    /** Message title taken from the heading line; {@code null} until found. */
    String title = null;
    /** Message timestamp taken from the {@code <I>…</I>} line; {@code null} until found. */
    Date date = null;
    /** Text of the last {@code <B>…</B>} line seen; {@code null} until found. */
    String name = null;
    /** Lines of the message body, in page order. */
    ArrayList<String> contents = new ArrayList<>(); 

    /**
     * Formatter for the JSON {@code date} value. It is pinned to UTC so the
     * literal " UTC" it appends is true regardless of the JVM default zone.
     */
    SimpleDateFormat sdf2 = utcFormat("yyyy-MM-dd HH:mm:ss' UTC'");
    
    /**
     * Creates an article reference; nothing is downloaded until {@link #load()}.
     *
     * @param path leading part of the page address
     * @param dir middle segment of the page address
     * @param id page file name without the {@code .html} extension
     * @throws java.io.IOException never thrown by this implementation; the
     *         clause is kept so existing callers keep compiling
     */
    public Article(String path, String dir, String id) throws IOException {
        this.path = path;
        this.dir = dir;
        this.id = id;
        this.url = String.format("%s/%s/%s.html", path, dir, id);
    }
    
    /**
     * Exports this article as a JSON object.
     *
     * <p>The keys {@code path}, {@code dir}, {@code url}, {@code title},
     * {@code id}, {@code name} and {@code date} are written only when the
     * matching field is non-null; {@code contents} is always written (possibly
     * as an empty array).
     *
     * @return the JSON representation of the fields loaded so far
     */
    JsonObject toJsonObject() {
        JsonObjectBuilder json = Json.createObjectBuilder();
        if (this.path != null) {
            json.add("path", path);
        }
        if (this.dir != null) {
            json.add("dir", dir);
        }
        if (this.url != null) {
            json.add("url", url);
        }

        if (this.title != null) {
            json.add("title", title);
        }
        if (this.id != null) {
            json.add("id", id);
        }
        if (this.name != null) {
            json.add("name", name);
        }
        if (this.date != null) {
            json.add("date", sdf2.format(date));
        }
        JsonArrayBuilder lines = Json.createArrayBuilder();
        for (String content : this.contents) {
            lines.add(content);
        }
        json.add("contents", lines);
        return json.build();
    }
    
    /**
     * Wraps a single JSON object in a one-element JSON array.
     *
     * @param obj the object to wrap
     * @return an array whose only element is {@code obj}
     */
    static JsonArray toJsonArray(JsonObject obj) {
        JsonArrayBuilder arryBuild = Json.createArrayBuilder();
        arryBuild.add(obj);
        return arryBuild.build();
    }
    
    /**
     * Downloads the page and parses it into this object's fields.
     *
     * <p>The page is first read as UTF-8. If a {@code <META>} line before
     * {@code <BODY} declares {@code charset=EUC-JP}, that first read is
     * abandoned and the page is fetched again and parsed as EUC-JP.
     *
     * <p>This method never throws: every failure is logged at SEVERE level and
     * the fields keep whatever had been parsed up to that point.
     */
    @SuppressWarnings("UseSpecificCatch")
    public void load() {
        boolean parsed;
        try {
            parsed = fetchAndParse("UTF-8", true);
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Failed to load article " + url, ex);
            return;
        }
        if (parsed) {
            return;
        }

        // The header announced EUC-JP, so the text decoded as UTF-8 is useless: start over.
        try {
            fetchAndParse("EUC-JP", false);
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Failed to load article " + url + " as EUC-JP", ex);
        }
    }

    /**
     * Opens the page with the given decoding and hands it to {@link #getLi(BufferedReader)}.
     *
     * @param charsetName charset used to decode the response body
     * @param detectEuc when true, the header is scanned first and parsing is
     *        skipped if it declares EUC-JP
     * @return {@code true} if the page was parsed, {@code false} if parsing was
     *         skipped because the header declares EUC-JP
     * @throws IOException if the download or the parsing fails
     */
    private boolean fetchAndParse(String charsetName, boolean detectEuc) throws IOException {
        URL aurl = new URL(String.format("%s/%s/%s.html", path, dir, id));
        HttpURLConnection http = (HttpURLConnection) aurl.openConnection();
        http.setRequestMethod("GET");
        http.connect();
        // Closing the reader closes the response stream, which releases the connection.
        try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), charsetName))) {
            if (detectEuc && headerDeclaresEucJp(rd)) {
                return false;
            }
            getLi(rd);
            return true;
        }
    }

    /**
     * Consumes header lines until a charset declaration or {@code <BODY} is found.
     *
     * @param rd reader positioned at the start of the page
     * @return {@code true} only if a {@code <META>} line declaring
     *         {@code charset=EUC-JP} is met before a UTF-8 declaration,
     *         the {@code <BODY} line, or end of input
     * @throws IOException if reading fails
     */
    private static boolean headerDeclaresEucJp(BufferedReader rd) throws IOException {
        String line;
        while ((line = rd.readLine()) != null) {
            String str = line.trim().toUpperCase();
            if (str.startsWith("<META ")) {
                if (str.contains("CHARSET=EUC-JP")) {
                    return true;
                }
                if (str.contains("CHARSET=UTF-8")) {
                    return false;
                }
            }
            if (str.startsWith("<BODY")) {
                return false;
            }
        }
        return false;
    }
    
    /**
     * Scans the page line by line and extracts title, date, name and body.
     *
     * <ul>
     *   <li>a line starting with {@code <H1>[OSM-ja] } sets {@link #title};</li>
     *   <li>a line of the form {@code <I>…(…)…</I>} sets {@link #date}: the
     *       text before and after the parenthesised part is joined and parsed
     *       (the parenthesised part is presumably the weekday);</li>
     *   <li>a line of the form {@code <B>…</B>} sets {@link #name};</li>
     *   <li>the line {@code <!--beginarticle-->} starts body collection via
     *       {@link #getArticle(BufferedReader)}.</li>
     * </ul>
     * Scanning continues to the end of input, so later matches overwrite
     * earlier ones.
     *
     * @param reader source of the page text
     * @throws IOException if reading fails, or if a date line with a
     *         parenthesised part matches neither accepted layout
     */
    void getLi(BufferedReader reader) throws IOException {
        String line;
        while((line = reader.readLine()) != null) {
            String str = line.trim();
            if (str.startsWith(TITLE_PREFIX)) {
                // Drop the closing tag only when it is really there, so an
                // unterminated heading neither throws nor loses characters.
                int end = str.endsWith(TITLE_SUFFIX)
                        ? str.length() - TITLE_SUFFIX.length()
                        : str.length();
                title = str.substring(TITLE_PREFIX.length(), end);
            }
            if (str.startsWith("<I>") && str.endsWith("</I>")) {
                // Progress trace on standard output, kept from the original implementation.
                System.out.println(this.id);
                
                StringTokenizer st = new StringTokenizer(str, "()");
                // Only "<I>date (x) time</I>" lines yield three tokens; other
                // italic lines are skipped rather than aborting the whole load.
                if (st.countTokens() >= 3) {
                    String datePart = st.nextToken().trim();
                    st.nextToken();     // parenthesised part, not needed for parsing
                    String timePart = st.nextToken().trim();
                    date = parseDate(datePart + timePart);
                }
            }
            if (str.startsWith("<B>") && str.endsWith("</B>")) {
                name = str.substring(3, str.length() - 4);
            }
            if (str.equals("<!--beginarticle-->")) {
                getArticle(reader);
            }
        }
    }

    /**
     * Parses a joined date line against each accepted layout, interpreting the
     * clock time as UTC.
     *
     * @param text date line with the parenthesised part removed
     * @return the parsed instant
     * @throws IOException wrapping the last {@link ParseException} when no
     *         layout matches
     */
    private static Date parseDate(String text) throws IOException {
        ParseException last = null;
        for (String pattern : DATE_PATTERNS) {
            try {
                return utcFormat(pattern).parse(text);
            }
            catch (ParseException e) {
                last = e;
            }
        }
        throw new IOException(last);
    }

    /**
     * Builds a formatter for {@code pattern} whose time zone is UTC.
     *
     * @param pattern a {@link SimpleDateFormat} pattern
     * @return a new formatter bound to the UTC zone
     */
    private static SimpleDateFormat utcFormat(String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        return format;
    }
    
    /**
     * Appends message body lines to {@link #contents}.
     *
     * <p>Reading stops, without storing the line, at {@code <!--endarticle-->}
     * or at a line starting with {@code </PRE>}, or at end of input. The
     * {@code <PRE>} prefix is removed from the first line that starts with it.
     *
     * @param reader reader positioned just after {@code <!--beginarticle-->}
     * @throws IOException if reading fails
     */
    void getArticle(BufferedReader reader) throws IOException {
        String line;
        boolean top = true;
        while((line = reader.readLine()) != null) {
            if (line.equals("<!--endarticle-->") || line.startsWith("</PRE>")) {
                return;
            }
            String str = line;
            if (top && line.startsWith("<PRE>")) {
                str = line.substring(5);    // 5 <-- length('<PRE>')
                top = false;
            }
            this.contents.add(str);
        }
    }
}