package talkcrawler; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; import java.net.HttpURLConnection; import java.net.URL; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.StringTokenizer; import java.util.logging.Level; import java.util.logging.Logger; import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonArrayBuilder; import javax.json.JsonObject; import javax.json.JsonObjectBuilder; import tool.json.JsonTool; public class Article { String url; String path; String dir; String id; String title = null; Date date = null; String name = null; ArrayList<String> contents = new ArrayList<>(); SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss' UTC'"); @SuppressWarnings({"UseSpecificCatch", "CallToPrintStackTrace"}) public static void main(String[] args) { try { Article ins = new Article(MonthlyIndex.MONTHLY_INDEX, "2019-January", "010424"); ins.load(); Article.post(Article.toJsonArray(ins.toJsonObject())); } catch (Exception e) { e.printStackTrace(); } } /** * * @param path * @param dir * @param id * @throws java.io.IOException */ public Article(String path, String dir, String id) throws IOException { this.path = path; this.dir = dir; this.id = id; this.url = String.format("%s/%s/%s.html", path, dir, id); } public static void post(JsonArray array) { String jsonText = array.toString(); HttpURLConnection con = null; try { URL url = new URL("http://172.17.0.1:8983/solr/talkja/update/json?commit=true"); con = (HttpURLConnection) url.openConnection(); con.setConnectTimeout(60000); con.setReadTimeout(60000); con.addRequestProperty("Content-Type", "application/json; charset=UTF-8"); con.setRequestMethod("POST"); con.setDoOutput(true); con.setDoInput(true); con.connect(); try (OutputStream outputStream = con.getOutputStream()) { try (PrintStream ps = new PrintStream(outputStream)) { ps.print(jsonText); } } // recv response int statusCode = con.getResponseCode(); String responseData = ""; StringBuilder sb = new StringBuilder(); try (InputStream stream = con.getInputStream()) { String line = ""; BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8")); while ((line = br.readLine()) != null) { sb.append(line); } } responseData = sb.toString(); JsonObject res = JsonTool.parse(responseData); if (res != null) { JsonObject header = res.getJsonObject("responseHeader"); if (header != null) { int status = header.getInt("status"); if (status == 0) { System.out.println(" +++++ "+ status); } else { System.out.println(" xxx "+ status); } } } } catch (Exception ex) { Logger.getLogger(Article.class.getName()).log(Level.SEVERE, null, ex); } finally { if (con != null) { con.disconnect(); } } } JsonObject toJsonObject() { JsonObjectBuilder json = Json.createObjectBuilder(); if (this.path != null) { json.add("path", path); } if (this.dir != null) { json.add("dir", dir); } if (this.url != null) { json.add("url", url); } if (this.title != null) { json.add("title", title); } if (this.id != null) { json.add("id", id); } if (this.name != null) { json.add("name", name); } if (this.date != null) { json.add("date", sdf2.format(date)); } JsonArrayBuilder arryBuild = Json.createArrayBuilder(); for (String content : this.contents) { arryBuild.add(content); } json.add("contents", arryBuild); return json.build(); } static JsonArray toJsonArray(JsonObject obj) { JsonArrayBuilder arryBuild = Json.createArrayBuilder(); arryBuild.add(obj); return arryBuild.build(); } @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"}) public void load() { boolean euc = false; try { URL url = new URL(String.format("%s/%s/%s.html", path, dir, id)); HttpURLConnection http = (HttpURLConnection)url.openConnection(); http.setRequestMethod("GET"); http.connect(); try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) { String line; while((line = rd.readLine()) != null) { String str = line.trim().toUpperCase(); if (str.startsWith("<META ")) { if (str.contains("CHARSET=EUC-JP")) { euc = true; break; } if (str.contains("CHARSET=UTF-8")) { euc = false; break; } } if (str.startsWith("<BODY")) { break; } } if (!euc) { getLi(rd); } } } catch (Exception ex) { Logger.getLogger(Article.class.getName()).log(Level.SEVERE, null, ex); return; } try { if (euc) { URL url = new URL(String.format("%s/%s/%s", path, dir, id)); HttpURLConnection http = (HttpURLConnection)url.openConnection(); http.setRequestMethod("GET"); http.connect(); try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "EUC-JP"))) { getLi(rd); } } } catch (Exception ex) { Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex); return; } } void getLi(BufferedReader reader) throws IOException { SimpleDateFormat sdf1 = new SimpleDateFormat("'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' UTC</I>'"); String line; while((line = reader.readLine()) != null) { String str = line.trim(); if (str.startsWith("<H1>[OSM-ja] ")) { title = str.substring(13, str.length() - 5); // 13 <-- length('<H1>[OSM-ja] ') } if (str.startsWith("<I>") && str.endsWith("</I>")) { String dateStr = str.substring(3, str.length() - 4); StringTokenizer st = new StringTokenizer(str, "()"); if (st.hasMoreTokens()) { String date1 = st.nextToken().trim(); st.nextToken(); String date3 = st.nextToken().trim(); try { date = sdf1.parse(date1 + date3); } catch(ParseException e) { e.printStackTrace(); } } } if (str.startsWith("<B>") && str.endsWith("</B>")) { name = str.substring(3, str.length() - 4); } if (str.equals("<!--beginarticle-->")) { getArticle(reader); } } } /* public void post(JsonObject obj) { JsonArray arry = Json.createArrayBuilder().add(obj).build(); System.out.println(arry.toString()); } */ void getArticle(BufferedReader reader) throws IOException { String line; boolean top = true; while((line = reader.readLine()) != null) { if (line.equals("<!--endarticle-->")) { return; } if (line.startsWith("</PRE>")) { return; } String str = line; if (line.startsWith("<PRE>") && top) { str = line.substring(5); top = false; } this.contents.add(str); } } }