Newer
Older
talkCrawler / src / talkcrawler / DailyIndex.java
  1. package talkcrawler;
  2.  
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
  12.  
  13. public class DailyIndex {
  14. String path;
  15. String dir;
  16. String file;
  17. /**
  18. *
  19. * @param path
  20. * @param dir
  21. * @param file
  22. * @throws java.io.IOException
  23. */
  24. public DailyIndex(String path, String dir, String file) throws IOException {
  25. this.path = path;
  26. this.dir = dir;
  27. this.file = file;
  28. }
  29.  
  30. @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"})
  31. public void load() {
  32. boolean euc = false;
  33. try {
  34. URL url = new URL(String.format("%s/%s/%s", path, dir, file));
  35. HttpURLConnection http = (HttpURLConnection)url.openConnection();
  36. http.setRequestMethod("GET");
  37. http.connect();
  38. try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) {
  39. String line;
  40. while((line = rd.readLine()) != null) {
  41. String str = line.trim().toUpperCase();
  42. if (str.startsWith("<META ")) {
  43. if (str.contains("CHARSET=EUC-JP")) {
  44. euc = true;
  45. break;
  46. }
  47. }
  48. if (str.startsWith("<BODY")) {
  49. break;
  50. }
  51. }
  52. if (!euc) {
  53. getLi(rd);
  54. }
  55. }
  56. }
  57. catch (Exception ex) {
  58. Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
  59. return;
  60. }
  61.  
  62. try {
  63. if (euc) {
  64. URL url = new URL(String.format("%s/%s/%s", path, dir, file));
  65. HttpURLConnection http = (HttpURLConnection)url.openConnection();
  66. http.setRequestMethod("GET");
  67. http.connect();
  68. try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "EUC-JP"))) {
  69. getLi(rd);
  70. }
  71. }
  72. }
  73. catch (Exception ex) {
  74. Logger.getLogger(DailyIndex.class.getName()).log(Level.SEVERE, null, ex);
  75. return;
  76. }
  77. }
  78. void getLi(BufferedReader reader) throws IOException {
  79. String line;
  80. String title = "";
  81. String id = "";
  82. String name = "";
  83. while((line = reader.readLine()) != null) {
  84. String str = line.trim();
  85. if (str.toUpperCase().startsWith("<LI>")) {
  86. String str1 = str.substring(4); // 4 <-- length('<LI>')
  87. title = getTitle(str1);
  88. id = getId(str1);
  89. }
  90. if (str.toUpperCase().startsWith("<I>")) {
  91. name = str.substring(3);
  92. }
  93. if (str.equals("</I>")) {
  94. System.out.println("-----");
  95. (new Article(MonthlyIndex.MONTHLY_INDEX, dir, id)).load();
  96. }
  97. }
  98. }
  99. String getTitle(String str) throws IOException {
  100. String title = "";
  101. StringTokenizer st = new StringTokenizer(str, ">");
  102. if (st.hasMoreTokens()) {
  103. String no = st.nextToken().trim();
  104. }
  105. if (st.hasMoreTokens()) {
  106. title = st.nextToken().trim();
  107. }
  108. return title;
  109. }
  110. String getId(String str) throws IOException {
  111. String file = "";
  112. StringTokenizer st = new StringTokenizer(str, ">");
  113. if (st.hasMoreTokens()) {
  114. String no = st.nextToken().trim();
  115. if (no.toUpperCase().startsWith("<A HREF=")) {
  116. file = no.substring(9, no.length() - 6);
  117. }
  118. }
  119. return file;
  120. }
  121. }