Newer
Older
talkCrawler / src / talkcrawler / Article.java
@haya4 haya4 on 10 Mar 2019 8 KB ok
package talkcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObject;
import javax.json.JsonObjectBuilder;
import tool.json.JsonTool;

/**
 * A single archived mailing-list article.
 *
 * <p>An instance is identified by {@code path}/{@code dir}/{@code id}; the page
 * is fetched from {@code path/dir/id.html}. {@link #load()} downloads the HTML
 * page and extracts the title, date, sender name and body lines from it.
 * {@link #toJsonObject()} turns the extracted fields into a JSON document and
 * {@link #post(JsonArray)} sends such documents to the update endpoint that is
 * hard-coded in that method.
 */
public class Article {
    private static final Logger LOGGER = Logger.getLogger(Article.class.getName());

    /** Prefix of the heading line that carries the title. */
    private static final String TITLE_PREFIX = "<H1>[OSM-ja] ";
    /** Closing tag stripped from the end of the title line when present. */
    private static final String TITLE_SUFFIX = "</H1>";

    String url;
    String path;
    String dir;
    String id;
    String title = null;
    Date date = null;
    String name = null;
    ArrayList<String> contents = new ArrayList<>();

    // NOTE(review): this formatter and the parser in getLi() both use the JVM
    // default time zone although the text is labelled UTC. The formatted string
    // round-trips, but the Date instant itself is offset - confirm before changing.
    SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss' UTC'");

    /**
     * Manual smoke test: loads one fixed article and posts it.
     *
     * @param args unused
     */
    @SuppressWarnings({"UseSpecificCatch", "CallToPrintStackTrace"})
    public static void main(String[] args) {
        try {
            Article ins = new Article(MonthlyIndex.MONTHLY_INDEX, "2019-January", "010424");
            ins.load();
            Article.post(Article.toJsonArray(ins.toJsonObject()));
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates an article reference; no network access happens here.
     *
     * @param path base URL of the archive
     * @param dir  directory component below {@code path}
     * @param id   article id; the page is {@code id + ".html"}
     * @throws java.io.IOException never thrown by this constructor; the clause
     *         is kept so existing callers that catch it still compile
     */
    public Article(String path, String dir, String id) throws IOException {
        this.path = path;
        this.dir = dir;
        this.id = id;
        this.url = String.format("%s/%s/%s.html", path, dir, id);
    }

    /**
     * Sends the given JSON array as a POST request and prints the
     * {@code responseHeader.status} value found in the JSON response.
     *
     * <p>All failures are logged; nothing is thrown to the caller.
     *
     * @param array documents to send
     */
    public static void post(JsonArray array) {
        String jsonText = array.toString();
        HttpURLConnection con = null;
        try {
            URL url = new URL("http://172.17.0.1:8983/solr/talkja/update/json?commit=true");
            con = (HttpURLConnection) url.openConnection();
            con.setConnectTimeout(60000);
            con.setReadTimeout(60000);
            con.addRequestProperty("Content-Type", "application/json; charset=UTF-8");
            con.setRequestMethod("POST");
            con.setDoOutput(true);
            con.setDoInput(true);
            con.connect();

            // Encode explicitly as UTF-8 so the bytes match the declared
            // Content-Type regardless of the platform default charset, and write
            // directly so that I/O errors are not swallowed by a PrintStream.
            try (OutputStream outputStream = con.getOutputStream()) {
                outputStream.write(jsonText.getBytes("UTF-8"));
            }

            // recv response
            int statusCode = con.getResponseCode();
            if (statusCode >= HttpURLConnection.HTTP_BAD_REQUEST) {
                // getInputStream() would throw for these codes; report the status instead.
                LOGGER.log(Level.SEVERE, "update request failed: HTTP {0}", statusCode);
                return;
            }

            StringBuilder sb = new StringBuilder();
            try (BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line);
                }
            }

            JsonObject res = JsonTool.parse(sb.toString());
            if (res != null) {
                JsonObject header = res.getJsonObject("responseHeader");
                if (header != null) {
                    int status = header.getInt("status");
                    if (status == 0) {
                        System.out.println("     +++++ "+ status);
                    }
                    else {
                        System.out.println("     xxx "+ status);
                    }
                }
            }
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
        finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Builds the JSON document for this article.
     *
     * <p>Scalar fields that are {@code null} are omitted; {@code contents} is
     * always present (possibly as an empty array).
     *
     * @return the JSON object
     */
    JsonObject toJsonObject() {
        JsonObjectBuilder json = Json.createObjectBuilder();
        if (this.path != null) {
            json.add("path", path);
        }
        if (this.dir != null) {
            json.add("dir", dir);
        }
        if (this.url != null) {
            json.add("url", url);
        }
        if (this.title != null) {
            json.add("title", title);
        }
        if (this.id != null) {
            json.add("id", id);
        }
        if (this.name != null) {
            json.add("name", name);
        }
        if (this.date != null) {
            json.add("date", sdf2.format(date));
        }
        JsonArrayBuilder lines = Json.createArrayBuilder();
        for (String content : this.contents) {
            lines.add(content);
        }
        json.add("contents", lines);
        return json.build();
    }

    /**
     * Wraps a single JSON object in a one-element JSON array.
     *
     * @param obj the object to wrap
     * @return array containing only {@code obj}
     */
    static JsonArray toJsonArray(JsonObject obj) {
        return Json.createArrayBuilder().add(obj).build();
    }

    /**
     * Downloads the article page and fills in title, date, name and contents.
     *
     * <p>The page is first read as UTF-8 while looking for a charset
     * declaration. If the page declares EUC-JP it is fetched a second time and
     * decoded as EUC-JP; otherwise parsing continues on the first stream.
     * Failures are logged and end the load; nothing is thrown.
     */
    @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"})
    public void load() {
        boolean euc;
        try {
            HttpURLConnection http = openPage();
            try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) {
                euc = declaresEucJp(rd);
                if (!euc) {
                    getLi(rd);
                }
            }
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return;
        }

        if (!euc) {
            return;
        }

        try {
            // Same page as the first pass (including the ".html" suffix),
            // this time decoded with the charset the page declared.
            HttpURLConnection http = openPage();
            try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "EUC-JP"))) {
                getLi(rd);
            }
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /** Opens a connected GET request for this article's {@code path/dir/id.html} page. */
    private HttpURLConnection openPage() throws IOException {
        URL pageUrl = new URL(String.format("%s/%s/%s.html", path, dir, id));
        HttpURLConnection http = (HttpURLConnection) pageUrl.openConnection();
        http.setRequestMethod("GET");
        http.connect();
        return http;
    }

    /**
     * Reads lines until a {@code <META>} line declaring EUC-JP or UTF-8, or a
     * line starting with {@code <BODY}, is found; returns true only for EUC-JP.
     */
    private static boolean declaresEucJp(BufferedReader rd) throws IOException {
        String line;
        while ((line = rd.readLine()) != null) {
            String str = line.trim().toUpperCase();
            if (str.startsWith("<META ")) {
                if (str.contains("CHARSET=EUC-JP")) {
                    return true;
                }
                if (str.contains("CHARSET=UTF-8")) {
                    return false;
                }
            }
            if (str.startsWith("<BODY")) {
                return false;
            }
        }
        return false;
    }

    /**
     * Scans the remaining lines of the page for the header fields and the body.
     *
     * <ul>
     * <li>a line starting with {@code <H1>[OSM-ja] } sets {@code title};</li>
     * <li>a {@code <I>...</I>} line containing a parenthesised part sets {@code date};</li>
     * <li>a {@code <B>...</B>} line sets {@code name};</li>
     * <li>{@code <!--beginarticle-->} hands the reader to {@link #getArticle(BufferedReader)}.</li>
     * </ul>
     *
     * @param reader reader positioned somewhere before the fields of interest
     * @throws IOException if reading fails
     */
    void getLi(BufferedReader reader) throws IOException {
        SimpleDateFormat sdf1 = new SimpleDateFormat("'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' UTC</I>'");

        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();

            if (str.startsWith(TITLE_PREFIX)) {
                // Strip the closing tag only when it is really there, so a
                // heading without it cannot push the end index before the start.
                int end = str.endsWith(TITLE_SUFFIX) ? str.length() - TITLE_SUFFIX.length() : str.length();
                title = str.substring(TITLE_PREFIX.length(), end);
            }

            if (str.startsWith("<I>") && str.endsWith("</I>")) {
                // Split on the parentheses: the text before and after them is
                // what sdf1 parses; the parenthesised token itself is discarded.
                StringTokenizer st = new StringTokenizer(str, "()");
                if (st.countTokens() >= 3) {
                    String date1 = st.nextToken().trim();
                    st.nextToken();
                    String date3 = st.nextToken().trim();
                    try {
                        date = sdf1.parse(date1 + date3);
                    }
                    catch (ParseException e) {
                        LOGGER.log(Level.WARNING, null, e);
                    }
                }
            }

            if (str.startsWith("<B>") && str.endsWith("</B>")) {
                name = str.substring(3, str.length() - 4);
            }

            if (str.equals("<!--beginarticle-->")) {
                getArticle(reader);
            }
        }
    }

    /**
     * Collects body lines into {@code contents} until {@code <!--endarticle-->},
     * a line starting with {@code </PRE>}, or end of input.
     *
     * <p>The leading {@code <PRE>} tag is removed from the first line that
     * starts with it; all other lines are stored unchanged.
     *
     * @param reader reader positioned just after the begin-article marker
     * @throws IOException if reading fails
     */
    void getArticle(BufferedReader reader) throws IOException {
        boolean top = true;
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.equals("<!--endarticle-->") || line.startsWith("</PRE>")) {
                return;
            }
            String str = line;
            if (top && line.startsWith("<PRE>")) {
                str = line.substring(5);
                top = false;
            }
            this.contents.add(str);
        }
    }
}