package talkcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import tool.http.Post;

/**
 * Downloads one index page ({@code <path>/<dir>/<file>}), scans it line by
 * line for {@code <LI>} entries and, for each entry terminated by a
 * {@code </I>} line, loads the referenced {@link Article} and posts it as JSON.
 */
public class DailyIndex {

    /** Line prefix (upper-cased) that marks the start of the article list. */
    private static final String ARTICLE_COUNT_MARKER = "<B>記事数:</B>";

    /** Length of the {@code <LI>} tag that prefixes every list entry. */
    private static final int LI_TAG_LENGTH = 4;

    /** Number of leading characters dropped from the anchor token in {@link #getId}. */
    private static final int HREF_PREFIX_LENGTH = 9;

    /** Number of trailing characters dropped from the anchor token in {@link #getId}. */
    private static final int HREF_SUFFIX_LENGTH = 6;

    String path;
    String dir;
    String file;
    TalkCrawlerProperties prop;

    /**
     * Command-line entry point.
     *
     * <p>Formats yesterday's date as {@code yyyy-<full English month name>} in GMT,
     * uses that string as the directory name and processes {@code date.html}
     * inside it.
     *
     * @param args unused
     * @throws IOException declared by the {@link DailyIndex} constructor
     */
    public static void main(String[] args) throws IOException {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(new Date());
        calendar.add(Calendar.DAY_OF_MONTH, -1);

        DateFormat df = new SimpleDateFormat("yyyy-MMMMMMMM", Locale.UK);
        df.setTimeZone(TimeZone.getTimeZone("GMT"));
        String monthly = df.format(calendar.getTime());

        TalkCrawlerProperties prop = new TalkCrawlerProperties().load();
        DailyIndex ins = new DailyIndex(prop, monthly, "date.html");
        ins.load();
    }

    /**
     * Creates an index reader for {@code <MONTHLY_INDEX>/<dir>/<file>}.
     *
     * @param prop crawler properties; the {@code MONTHLY_INDEX} property is used as the base URL
     * @param dir directory segment of the URL
     * @param file file segment of the URL
     * @throws java.io.IOException never thrown by this implementation; kept so existing callers still compile
     */
    public DailyIndex(TalkCrawlerProperties prop, String dir, String file) throws IOException {
        this.prop = prop;
        this.path = prop.getProperty("MONTHLY_INDEX");
        this.dir = dir;
        this.file = file;
    }

    /**
     * Fetches the index page and processes its entries.
     *
     * <p>The page is first read as UTF-8. If a {@code <META>} line declaring
     * {@code CHARSET=EUC-JP} is seen before {@code <BODY>}, the page is fetched a
     * second time and decoded as EUC-JP; otherwise processing continues on the
     * already-open UTF-8 reader. Any exception is logged at SEVERE level and
     * ends processing; nothing is propagated to the caller.
     */
    @SuppressWarnings("UseSpecificCatch")
    public void load() {
        boolean euc;
        try (BufferedReader rd = openReader("UTF-8")) {
            euc = declaresEucJp(rd);
            if (!euc) {
                // Header was decodable as UTF-8: keep consuming the same stream.
                getLi(rd);
            }
        } catch (Exception ex) {
            Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
            return;
        }

        if (!euc) {
            return;
        }

        // The body must be decoded as EUC-JP, so the page is requested again.
        try (BufferedReader rd = openReader("EUC-JP")) {
            getLi(rd);
        } catch (Exception ex) {
            Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Opens a GET connection to the index page and wraps its body in a reader using the given charset. */
    private BufferedReader openReader(String charsetName) throws IOException {
        URL url = new URL(String.format("%s/%s/%s", path, dir, file));
        HttpURLConnection http = (HttpURLConnection) url.openConnection();
        http.setRequestMethod("GET");
        http.connect();
        return new BufferedReader(new InputStreamReader(http.getInputStream(), charsetName));
    }

    /**
     * Consumes header lines until a {@code <META>} line declaring EUC-JP or a
     * {@code <BODY>} line is found; returns {@code true} only in the former case.
     */
    private static boolean declaresEucJp(BufferedReader rd) throws IOException {
        String line;
        while ((line = rd.readLine()) != null) {
            String upper = line.trim().toUpperCase(Locale.ROOT);
            if (upper.startsWith("<META ") && upper.contains("CHARSET=EUC-JP")) {
                return true;
            }
            if (upper.startsWith("<BODY")) {
                return false;
            }
        }
        return false;
    }

    /**
     * Scans the remaining lines of the page for list entries.
     *
     * <p>Each line starting with {@code <LI>} updates the current article id
     * (see {@link #getId}). Each line that is exactly {@code </I>} (any case)
     * triggers loading of the article with the current id and posting it.
     * Scanning stops at the first {@code </UL>} line that follows the
     * article-count marker line, or at end of input.
     *
     * @param reader reader positioned inside the index page
     * @throws IOException if reading from {@code reader} fails
     */
    void getLi(BufferedReader reader) throws IOException {
        String id = "";
        boolean datain = false;

        String line;
        while ((line = reader.readLine()) != null) {
            String str = line.trim();
            // Locale.ROOT keeps tag matching independent of the default locale
            // (e.g. Turkish upper-casing of 'i').
            String upper = str.toUpperCase(Locale.ROOT);

            // The line is upper-cased before comparison, so the marker must be
            // upper-case too; a lower-case "<b>" literal can never match.
            if (upper.startsWith(ARTICLE_COUNT_MARKER)) {
                datain = true;
            }
            if (upper.startsWith("<LI>")) {
                id = getId(str.substring(LI_TAG_LENGTH));
            }
            // Matched case-insensitively, like every other tag in this scan.
            if (upper.equals("</I>")) {
                System.out.println("-----");
                Article article = new Article(path, dir, id);
                article.load();
                new Post(prop).post(Article.toJsonArray(article.toJsonObject()));
            }
            if (datain && upper.startsWith("</UL>")) {
                break;
            }
        }
    }

    /**
     * Returns the second {@code '>'}-delimited token of {@code str}, trimmed.
     *
     * <p>NOTE(review): for input such as {@code <A HREF="x.html">Title</A>} the
     * result is {@code Title</A} (the closing tag minus its {@code '>'});
     * confirm whether callers expect that.
     *
     * @param str list-entry text with the leading {@code <LI>} already removed
     * @return the trimmed second token, or {@code ""} if there are fewer than two tokens
     * @throws IOException never thrown by this implementation; kept so existing callers still compile
     */
    String getTitle(String str) throws IOException {
        StringTokenizer st = new StringTokenizer(str, ">");
        if (st.hasMoreTokens()) {
            st.nextToken(); // skip the anchor-tag token
        }
        return st.hasMoreTokens() ? st.nextToken().trim() : "";
    }

    /**
     * Extracts the article id from the anchor at the start of a list entry.
     *
     * <p>The first {@code '>'}-delimited token must start with {@code <A HREF=}
     * (any case). The id is that token with its first 9 and last 6 characters
     * removed — presumably the {@code <A HREF="} prefix and a {@code .html"}
     * suffix; confirm against the actual page markup.
     *
     * @param str list-entry text with the leading {@code <LI>} already removed
     * @return the extracted id, or {@code ""} if there is no anchor token or it is
     *         too short to contain both the prefix and the suffix
     * @throws IOException never thrown by this implementation; kept so existing callers still compile
     */
    String getId(String str) throws IOException {
        StringTokenizer st = new StringTokenizer(str, ">");
        if (!st.hasMoreTokens()) {
            return "";
        }
        String anchor = st.nextToken().trim();
        if (!anchor.toUpperCase(Locale.ROOT).startsWith("<A HREF=")) {
            return "";
        }
        int end = anchor.length() - HREF_SUFFIX_LENGTH;
        if (end < HREF_PREFIX_LENGTH) {
            // Token too short: substring() would throw StringIndexOutOfBoundsException.
            return "";
        }
        return anchor.substring(HREF_PREFIX_LENGTH, end);
    }
}