package talkcrawler; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.StringTokenizer; import java.util.logging.Level; import java.util.logging.Logger; import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonArrayBuilder; import javax.json.JsonObject; import javax.json.JsonObjectBuilder; public class Article { String url; String path; String dir; String id; String title = null; Date date = null; String name = null; ArrayList<String> contents = new ArrayList<>(); SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss' UTC'"); /** * * @param path * @param dir * @param id * @throws java.io.IOException */ public Article(String path, String dir, String id) throws IOException { this.path = path; this.dir = dir; this.id = id; this.url = String.format("%s/%s/%s.html", path, dir, id); } JsonObject toJsonObject() { JsonObjectBuilder json = Json.createObjectBuilder(); if (this.path != null) { json.add("path", path); } if (this.dir != null) { json.add("dir", dir); } if (this.url != null) { json.add("url", url); } if (this.title != null) { json.add("title", title); } if (this.id != null) { json.add("id", id); } if (this.name != null) { json.add("name", name); } if (this.date != null) { json.add("date", sdf2.format(date)); } JsonArrayBuilder arryBuild = Json.createArrayBuilder(); for (String content : this.contents) { arryBuild.add(content); } json.add("contents", arryBuild); return json.build(); } static JsonArray toJsonArray(JsonObject obj) { JsonArrayBuilder arryBuild = Json.createArrayBuilder(); arryBuild.add(obj); return arryBuild.build(); } @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"}) public void load() { boolean euc = false; try { URL aurl = new URL(String.format("%s/%s/%s.html", path, dir, id)); HttpURLConnection http = (HttpURLConnection)aurl.openConnection(); http.setRequestMethod("GET"); http.connect(); try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) { String line; while((line = rd.readLine()) != null) { String str = line.trim().toUpperCase(); if (str.startsWith("<META ")) { if (str.contains("CHARSET=EUC-JP")) { euc = true; break; } if (str.contains("CHARSET=UTF-8")) { euc = false; break; } } if (str.startsWith("<BODY")) { break; } } if (!euc) { getLi(rd); } } } catch (Exception ex) { Logger.getLogger(Article.class.getName()).log(Level.SEVERE, null, ex); return; } try { if (euc) { URL aurl = new URL(String.format("%s/%s/%s.html", path, dir, id)); HttpURLConnection http = (HttpURLConnection)aurl.openConnection(); http.setRequestMethod("GET"); http.connect(); try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "EUC-JP"))) { getLi(rd); } } } catch (Exception ex) { Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex); } } void getLi(BufferedReader reader) throws IOException { SimpleDateFormat sdf1 = new SimpleDateFormat("'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' UTC</I>'"); String line; while((line = reader.readLine()) != null) { String str = line.trim(); if (str.startsWith("<H1>[OSM-ja] ")) { title = str.substring(13, str.length() - 5); // 13 <-- length('<H1>[OSM-ja] ') } if (str.startsWith("<I>") && str.endsWith("</I>")) { System.out.println(this.id); StringTokenizer st = new StringTokenizer(str, "()"); if (st.hasMoreTokens()) { String date1 = st.nextToken().trim(); st.nextToken(); String date3 = st.nextToken().trim(); try { date = sdf1.parse(date1 + date3); } catch(ParseException e) { try { SimpleDateFormat sdf3 = new SimpleDateFormat("'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' GMT</I>'"); date = sdf3.parse(date1 + date3); } catch (ParseException e2) { throw new IOException(e2); } } } } if (str.startsWith("<B>") && str.endsWith("</B>")) { name = str.substring(3, str.length() - 4); } if (str.equals("<!--beginarticle-->")) { getArticle(reader); } } } void getArticle(BufferedReader reader) throws IOException { String line; boolean top = true; while((line = reader.readLine()) != null) { if (line.equals("<!--endarticle-->")) { return; } if (line.startsWith("</PRE>")) { return; } String str = line; if (line.startsWith("<PRE>") && top) { str = line.substring(5); top = false; } this.contents.add(str); } } }