diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..942e6bb --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,37 @@ + +The MIT License (MIT) + +Copyright (c) 2019 Yuu Hayashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +------------------------------------------------------------------- +`talkCrawler.jar` +Copyright (c) 2019 Yuu Hayashi +This software is released under the MIT License, see LICENSE.txt. + +------------------------------------------------------------------- + +* [hayashi.jar](https://osdn.net/projects/hayashilib/) +Copyright (c) 2013 Yuu Hayashi +This software is released under the MIT License. 
+ +* [javax.json.jar](https://javaee.github.io/jsonp/) + +* [postgresql-42.2.4.jar](https://www.postgresql.org/) diff --git a/README.md b/README.md new file mode 100644 index 0000000..780508f --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# talkCrawler + +## Summary + +[Talk-ja -- OpenStreetMap Japanese talk](https://lists.openstreetmap.org/listinfo/talk-ja) の過去の記事をクロールして, + ローカルデータベースに収集する。 + +* [talkCrawler](doc/talkCrawler.md) <-- 詳細はこちら + + +## License + +* [MIT license](LICENSE.txt) diff --git a/build.xml b/build.xml new file mode 100644 index 0000000..665d866 --- /dev/null +++ b/build.xml @@ -0,0 +1,73 @@ + + + + + + + + + + + Builds, tests, and runs the project talkCrawler. + + + diff --git a/doc/talkCrawler.md b/doc/talkCrawler.md new file mode 100644 index 0000000..0486b6e --- /dev/null +++ b/doc/talkCrawler.md @@ -0,0 +1,35 @@ +# talkCrawler + + +## Summary + +[Talk-ja -- OpenStreetMap Japanese talk](https://lists.openstreetmap.org/listinfo/talk-ja) の過去の記事をクロールして, + ローカルデータベースに収集する。 + + +## Development + +* Developed with NetBeans 8.2 +* OpenJDK 8 + +## External library + +* [hayashi.jar](https://osdn.net/projects/hayashilib/releases/) - hayashi lib +* [javax.json.jar](https://javaee.github.io/jsonp/) - JSON parser +* [commons-compress-1.18.jar](https://commons.apache.org/proper/commons-compress/download_compress.cgi) - Apache Commons Compress +* [postgresql-42.2.4.jar](https://www.postgresql.org/) - PostgreSQL JDBC driver + +### build + +### Development environment + +![MachineStructure](machinestructure.png) + +1. curl で [monthly index](https://lists.openstreetmap.org/pipermail/talk-ja/) を取得する +2. +3. 生成された「統計データ」と「ベクタタイル」を GitHub に プッシュ(push) する +4. 
ついでに、`Raspi3`にある「公開用のPostGIS」にも「データ」をコピーする + + +## 処理 + diff --git a/manifest.mf b/manifest.mf new file mode 100644 index 0000000..328e8e5 --- /dev/null +++ b/manifest.mf @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +X-COMMENT: Main-Class will be added automatically by build + diff --git a/src/talkcrawler/Article.java b/src/talkcrawler/Article.java new file mode 100644 index 0000000..736184c --- /dev/null +++ b/src/talkcrawler/Article.java @@ -0,0 +1,165 @@ +package talkcrawler; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.StringTokenizer; +import java.util.logging.Level; +import java.util.logging.Logger; + +public class Article { + String path; + String dir; + String id; + + @SuppressWarnings({"UseSpecificCatch", "CallToPrintStackTrace"}) + public static void main(String[] args) { + try { + Article ins = new Article(MonthlyIndex.MONTHLY_INDEX, "2019-January", "010424"); + ins.load(); + } + catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * + * @param path + * @param dir + * @param id + * @throws java.io.IOException + */ + public Article(String path, String dir, String id) throws IOException { + this.path = path; + this.dir = dir; + this.id = id; + } + + @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"}) + public void load() { + boolean euc = false; + try { + URL url = new URL(String.format("%s/%s/%s.html", path, dir, id)); + HttpURLConnection http = (HttpURLConnection)url.openConnection(); + http.setRequestMethod("GET"); + http.connect(); + try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) { + String line; + while((line = rd.readLine()) != null) { + String str = line.trim().toUpperCase(); + if (str.startsWith("'yyyy'年 'M'月 'd'日'HH:mm:ss' UTC'"); + 
SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss' UTC'"); + + String line; + String title = ""; + Date date = null; + String name = ""; + while((line = reader.readLine()) != null) { + String str = line.trim(); + if (str.startsWith("

[OSM-ja] ")) { + title = str.substring(13, str.length() - 5); // 13 <-- length('

[OSM-ja] ') + } + if (str.startsWith("") && str.endsWith("")) { + String dateStr = str.substring(3, str.length() - 4); + + StringTokenizer st = new StringTokenizer(str, "()"); + if (st.hasMoreTokens()) { + String date1 = st.nextToken().trim(); + st.nextToken(); + String date3 = st.nextToken().trim(); + try { + date = sdf1.parse(date1 + date3); + } + catch(ParseException e) { + e.printStackTrace(); + } + } + + } + if (str.startsWith("") && str.endsWith("")) { + name = str.substring(3, str.length() - 4); + } + if (str.equals("")) { + getArticle(reader); + + System.out.println("{"); + System.out.println(title); + System.out.println(id); + System.out.println(sdf2.format(date)); + System.out.println(name); + System.out.println("}"); + title = ""; + id = ""; + name = ""; + date = null; + } + } + } + + void getArticle(BufferedReader reader) throws IOException { + String line; + boolean top = true; + while((line = reader.readLine()) != null) { + if (line.equals("")) { + return; + } + if (line.startsWith("")) { + return; + } + String str = line; + if (line.startsWith("
") && top) {
+                str = line.substring(5);
+                top = false;
+            }
+            
+            System.out.println(str);
+        }
+    }
+}
diff --git a/src/talkcrawler/DailyIndex.java b/src/talkcrawler/DailyIndex.java
new file mode 100644
index 0000000..b45a59d
--- /dev/null
+++ b/src/talkcrawler/DailyIndex.java
@@ -0,0 +1,126 @@
+package talkcrawler;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.StringTokenizer;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+public class DailyIndex {
+    String path;
+    String dir;
+    String file;
+    
+    /**
+     * Stores the three URL segments that load() joins as {@code path/dir/file}.
+     * @param path first segment of the URL requested by load()
+     * @param dir second segment of the URL requested by load()
+     * @param file third (last) segment of the URL requested by load()
+     * @throws java.io.IOException declared in the signature; nothing here raises it
+     */
+    public DailyIndex(String path, String dir, String file) throws IOException {
+        this.file = file;
+        this.dir = dir;
+        this.path = path;
+    }
+
+    @SuppressWarnings({"CallToPrintStackTrace", "SleepWhileInLoop", "UseSpecificCatch"})
+    public void load() {
+        boolean euc = false;
+        try {
+            URL url = new URL(String.format("%s/%s/%s", path, dir, file));
+            HttpURLConnection http = (HttpURLConnection)url.openConnection();
+            http.setRequestMethod("GET");
+            http.connect();
+            try (BufferedReader rd = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"))) {
+                String line;
+                while((line = rd.readLine()) != null) {
+                    String str = line.trim().toUpperCase();
+                    if (str.startsWith("")) {
+                String str1 = str.substring(4);     // 4 <-- length('
  • ') + title = getTitle(str1); + id = getId(str1); + } + if (str.toUpperCase().startsWith("")) { + name = str.substring(3); + } + if (str.equals("")) { + System.out.println("-----"); + (new Article(MonthlyIndex.MONTHLY_INDEX, dir, id)).load(); + } + } + } + + String getTitle(String str) throws IOException { + String title = ""; + StringTokenizer st = new StringTokenizer(str, ">"); + if (st.hasMoreTokens()) { + String no = st.nextToken().trim(); + } + if (st.hasMoreTokens()) { + title = st.nextToken().trim(); + } + return title; + } + + String getId(String str) throws IOException { + String file = ""; + StringTokenizer st = new StringTokenizer(str, ">"); + if (st.hasMoreTokens()) { + String no = st.nextToken().trim(); + if (no.toUpperCase().startsWith("")) { + in = true; + } + if (in) { + getBody(reader); + } + if (str.toUpperCase().endsWith("")) { + in = false; + } + } + } + + void getBody(BufferedReader reader) throws IOException { + boolean in = false; + String line; + while((line = reader.readLine()) != null) { + String str = line.trim(); + if (str.toUpperCase().startsWith("")) { + in = false; + } + } + } + + void getTable(BufferedReader reader) throws IOException { + boolean in = false; + String line; + while((line = reader.readLine()) != null) { + String str = line.trim(); + if (str.toUpperCase().startsWith("")) { + in = false; + } + } + } + + void getTr(BufferedReader reader) throws IOException { + boolean in = false; + String line; + while((line = reader.readLine()) != null) { + String str = line.trim(); + if (str.toUpperCase().startsWith("")) { + in = false; + } + } + } + + void getA(BufferedReader reader) throws IOException { + String line; + while((line = reader.readLine()) != null) { + String str = line.trim(); + if (str.toUpperCase().startsWith("[ 日付 ]")) { + String path = str.substring(9, str.length() - 12); + String dir = ""; + String name = ""; + StringTokenizer st = new StringTokenizer(path, "/"); + if (st.hasMoreTokens()) { + dir = 
st.nextToken().trim(); + } + if (st.hasMoreTokens()) { + name = st.nextToken().trim(); + } + + System.out.println(String.format("%s/%s/%s", MONTHLY_INDEX, dir, name)); + DailyIndex daily = new DailyIndex(MONTHLY_INDEX, dir, name); + daily.load(); + } + } + } +} diff --git a/src/talkcrawler/TalkCrawler.java b/src/talkcrawler/TalkCrawler.java new file mode 100644 index 0000000..92b34fc --- /dev/null +++ b/src/talkcrawler/TalkCrawler.java @@ -0,0 +1,15 @@ +package talkcrawler; + +public class TalkCrawler { + + @SuppressWarnings({"UseSpecificCatch", "CallToPrintStackTrace"}) + public static void main(String[] args) { + try { + MonthlyIndex ins = new MonthlyIndex(); + ins.load(); + } + catch (Exception e) { + e.printStackTrace(); + } + } +}