- package talkcrawler;
-
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import java.util.StringTokenizer;
- import java.util.logging.Level;
- import java.util.logging.Logger;
-
- public class DailyIndex {
- String path;
- String dir;
- String file;
-
- /**
- *
- * @param path
- * @param dir
- * @param file
- * @throws java.io.IOException
- */
- public DailyIndex(String path, String dir, String file) throws IOException {
- this.path = path;
- this.dir = dir;
- this.file = file;
- }
-
- @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"})
- public void load() {
- boolean euc = false;
- try {
- URL url = new URL(String.format("%s/%s/%s", path, dir, file));
- HttpURLConnection http = (HttpURLConnection)url.openConnection();
- http.setRequestMethod("GET");
- http.connect();
- try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) {
- String line;
- while((line = rd.readLine()) != null) {
- String str = line.trim().toUpperCase();
- if (str.startsWith("<META ")) {
- if (str.contains("CHARSET=EUC-JP")) {
- euc = true;
- break;
- }
- }
- if (str.startsWith("<BODY")) {
- break;
- }
- }
- if (!euc) {
- getLi(rd);
- }
- }
-
- }
- catch (Exception ex) {
- Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
- return;
- }
-
- try {
- if (euc) {
- URL url = new URL(String.format("%s/%s/%s", path, dir, file));
- HttpURLConnection http = (HttpURLConnection)url.openConnection();
- http.setRequestMethod("GET");
- http.connect();
- try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "EUC-JP"))) {
- getLi(rd);
- }
- }
- }
- catch (Exception ex) {
- Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
- return;
- }
- }
-
- void getLi(BufferedReader reader) throws IOException {
- String line;
- String title = "";
- String id = "";
- String name = "";
- while((line = reader.readLine()) != null) {
- String str = line.trim();
- if (str.toUpperCase().startsWith("<LI>")) {
- String str1 = str.substring(4); // 4 <-- length('<LI>')
- title = getTitle(str1);
- id = getId(str1);
- }
- if (str.toUpperCase().startsWith("<I>")) {
- name = str.substring(3);
- }
- if (str.equals("</I>")) {
- System.out.println("-----");
- (new Article(MonthlyIndex.MONTHLY_INDEX, dir, id)).load();
- }
- }
- }
-
- String getTitle(String str) throws IOException {
- String title = "";
- StringTokenizer st = new StringTokenizer(str, ">");
- if (st.hasMoreTokens()) {
- String no = st.nextToken().trim();
- }
- if (st.hasMoreTokens()) {
- title = st.nextToken().trim();
- }
- return title;
- }
-
- String getId(String str) throws IOException {
- String file = "";
- StringTokenizer st = new StringTokenizer(str, ">");
- if (st.hasMoreTokens()) {
- String no = st.nextToken().trim();
- if (no.toUpperCase().startsWith("<A HREF=")) {
- file = no.substring(9, no.length() - 6);
- }
- }
- return file;
- }
- }