package talkcrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObject;
import javax.json.JsonObjectBuilder;
/**
 * One archived message page, addressed as {@code path/dir/id.html}, together with the
 * fields scraped from its HTML: title, author name, date and the body lines.
 *
 * <p>Instances start out with only the address filled in; {@link #load()} downloads the
 * page and populates the remaining fields. The class is not thread-safe.
 */
public class Article {

    private static final Logger LOGGER = Logger.getLogger(Article.class.getName());

    /** Layout of the page address: base path, directory, message id. */
    private static final String URL_FORMAT = "%s/%s/%s.html";

    /** Marker that opens the heading line carrying the subject. */
    private static final String TITLE_PREFIX = "<H1>[OSM-ja] ";

    /** Closing tag expected at the end of the heading line. */
    private static final String TITLE_SUFFIX = "</H1>";

    /**
     * Accepted layouts of the header date once the parenthesised token has been
     * removed; tried in order.
     */
    private static final String[] HEADER_DATE_PATTERNS = {
        "'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' UTC</I>'",
        "'<I>'yyyy'年 'M'月 'd'日'HH:mm:ss' GMT</I>'"
    };

    String url;
    String path;
    String dir;
    String id;
    String title = null;
    Date date = null;
    String name = null;
    ArrayList<String> contents = new ArrayList<>();

    /**
     * Formatter used for the "date" JSON member. It is pinned to UTC because the
     * pattern appends a literal " UTC" to the rendered value.
     */
    SimpleDateFormat sdf2 = utcFormat("yyyy-MM-dd HH:mm:ss' UTC'");

    /**
     * Creates an article description without touching the network.
     *
     * @param path base address of the archive
     * @param dir directory below {@code path}
     * @param id message id, i.e. the page name without the {@code .html} suffix
     * @throws java.io.IOException never thrown by this implementation; kept in the
     *         signature so existing callers keep compiling
     */
    public Article(String path, String dir, String id) throws IOException {
        this.path = path;
        this.dir = dir;
        this.id = id;
        this.url = String.format(URL_FORMAT, path, dir, id);
    }

    /**
     * Renders this article as a JSON object.
     *
     * <p>Members are emitted in the order path, dir, url, title, id, name, date,
     * contents. Members whose value is {@code null} are omitted; "contents" is always
     * present, possibly as an empty array.
     *
     * @return the JSON representation of the current field values
     */
    JsonObject toJsonObject() {
        JsonObjectBuilder json = Json.createObjectBuilder();
        addIfPresent(json, "path", path);
        addIfPresent(json, "dir", dir);
        addIfPresent(json, "url", url);
        addIfPresent(json, "title", title);
        addIfPresent(json, "id", id);
        addIfPresent(json, "name", name);
        if (date != null) {
            json.add("date", sdf2.format(date));
        }
        JsonArrayBuilder lines = Json.createArrayBuilder();
        for (String content : contents) {
            lines.add(content);
        }
        json.add("contents", lines);
        return json.build();
    }

    /**
     * Wraps a single JSON object in a one-element JSON array.
     *
     * @param obj the object to wrap
     * @return an array whose only element is {@code obj}
     */
    static JsonArray toJsonArray(JsonObject obj) {
        JsonArrayBuilder wrapper = Json.createArrayBuilder();
        wrapper.add(obj);
        return wrapper.build();
    }

    /**
     * Downloads the page and fills in title, name, date and contents.
     *
     * <p>The page is first read as UTF-8. If a {@code <META>} line declares
     * {@code charset=EUC-JP} before the body starts, nothing is extracted from that
     * first read and the page is fetched a second time with the EUC-JP decoder.
     * Failures are logged and otherwise ignored, so a failed load leaves the fields in
     * whatever state they had reached.
     */
    @SuppressWarnings("UseSpecificCatch")
    public void load() {
        boolean euc;
        try {
            euc = loadAsUtf8();
        }
        catch (Exception ex) {
            // Best-effort boundary: parsing problems surface as runtime exceptions too.
            LOGGER.log(Level.SEVERE, "Failed to load " + String.format(URL_FORMAT, path, dir, id), ex);
            return;
        }
        if (!euc) {
            return;
        }
        try (BufferedReader rd = openReader("EUC-JP")) {
            getLi(rd);
        }
        catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Failed to load " + String.format(URL_FORMAT, path, dir, id), ex);
        }
    }

    /**
     * Scans header lines for the title, the author name and the date, and hands over
     * to {@link #getArticle(BufferedReader)} when the article start marker is reached.
     *
     * @param reader source of the page lines, positioned anywhere before the fields
     * @throws IOException if reading fails or a date line matches neither accepted layout
     */
    void getLi(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            if (str.startsWith(TITLE_PREFIX)) {
                title = extractTitle(str);
            }
            if (str.startsWith("<I>") && str.endsWith("</I>")) {
                // Progress output kept from the original implementation.
                System.out.println(this.id);
                StringTokenizer st = new StringTokenizer(str, "()");
                // An italic line without a parenthesised part is not a date line; skip
                // it instead of running the tokenizer past its end.
                if (st.countTokens() >= 3) {
                    String datePart = st.nextToken().trim();
                    st.nextToken(); // parenthesised token (presumably the weekday) is dropped
                    String timePart = st.nextToken().trim();
                    date = parseHeaderDate(datePart + timePart);
                }
            }
            if (str.startsWith("<B>") && str.endsWith("</B>")) {
                name = str.substring(3, str.length() - 4);
            }
            if (str.equals("<!--beginarticle-->")) {
                getArticle(reader);
            }
        }
    }

    /**
     * Appends body lines to {@code contents} until the end marker, a line starting
     * with {@code </PRE>}, or end of input. The first line found starting with
     * {@code <PRE>} has that tag removed before it is stored.
     *
     * @param reader source of the page lines, positioned just after the start marker
     * @throws IOException if reading fails
     */
    void getArticle(BufferedReader reader) throws IOException {
        String line;
        boolean preTagPending = true;
        while ((line = reader.readLine()) != null) {
            if (line.equals("<!--endarticle-->") || line.startsWith("</PRE>")) {
                return;
            }
            String str = line;
            if (preTagPending && line.startsWith("<PRE>")) {
                str = line.substring(5);
                preTagPending = false;
            }
            this.contents.add(str);
        }
    }

    /** Adds {@code key: value} to the builder unless the value is {@code null}. */
    private static void addIfPresent(JsonObjectBuilder json, String key, String value) {
        if (value != null) {
            json.add(key, value);
        }
    }

    /** Creates a formatter for {@code pattern} that reads and writes UTC. */
    private static SimpleDateFormat utcFormat(String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        return format;
    }

    /** Opens the page with a GET request and returns a reader decoding it as {@code charset}. */
    private BufferedReader openReader(String charset) throws IOException {
        URL aurl = new URL(String.format(URL_FORMAT, path, dir, id));
        HttpURLConnection http = (HttpURLConnection) aurl.openConnection();
        http.setRequestMethod("GET");
        http.connect();
        return new BufferedReader(new InputStreamReader(http.getInputStream(), charset));
    }

    /**
     * Reads the page as UTF-8 and extracts the fields, unless the page declares EUC-JP.
     *
     * @return {@code true} if an EUC-JP declaration was found and nothing was extracted
     */
    private boolean loadAsUtf8() throws IOException {
        try (BufferedReader rd = openReader("UTF-8")) {
            String line;
            while ((line = rd.readLine()) != null) {
                String str = line.trim().toUpperCase();
                if (str.startsWith("<META ")) {
                    if (str.contains("CHARSET=EUC-JP")) {
                        return true;
                    }
                    if (str.contains("CHARSET=UTF-8")) {
                        break;
                    }
                }
                if (str.startsWith("<BODY")) {
                    break;
                }
            }
            getLi(rd);
            return false;
        }
    }

    /** Returns the heading text after the prefix, without the closing tag when present. */
    private static String extractTitle(String str) {
        int end = str.length();
        if (str.endsWith(TITLE_SUFFIX)) {
            end -= TITLE_SUFFIX.length();
        }
        return str.substring(TITLE_PREFIX.length(), end);
    }

    /** Parses the joined date text with each accepted layout; the last failure is wrapped. */
    private static Date parseHeaderDate(String text) throws IOException {
        ParseException failure = null;
        for (String pattern : HEADER_DATE_PATTERNS) {
            try {
                return utcFormat(pattern).parse(text);
            }
            catch (ParseException e) {
                failure = e;
            }
        }
        throw new IOException(failure);
    }
}