Browse Source

Added importer for Gewerkschaftshaus Erfurt

Signed-off-by: Stephan Richter <s.richter@srsoftware.de>
main
Stephan Richter 4 months ago
parent
commit
78258860c1
  1. 3
      de.srsoftware.cal.app/src/main/java/de/srsoftware/cal/app/Application.java
  2. 2
      de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/BaseImporter.java
  3. 14
      de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/Util.java
  4. 1
      de.srsoftware.cal.db/src/main/java/de/srsoftware/cal/db/MariaDB.java
  5. 19
      de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/erfurt/Eburg.java
  6. 21
      de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/erfurt/FromHell.java
  7. 229
      de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/erfurt/GewerkschaftshausErfurt.java
  8. 3
      de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/weimar/CKeller.java
  9. 14
      de.srsoftware.cal.importer/src/test/java/RegexTest.java

3
de.srsoftware.cal.app/src/main/java/de/srsoftware/cal/app/Application.java

@ -14,14 +14,13 @@ import de.srsoftware.configuration.Configuration; @@ -14,14 +14,13 @@ import de.srsoftware.configuration.Configuration;
import de.srsoftware.configuration.JsonConfig;
import de.srsoftware.tools.ColorLogger;
import de.srsoftware.tools.plugin.JarWatchdog;
import org.json.JSONObject;
import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.sql.SQLException;
import java.time.Duration;
import java.util.Optional;
import org.json.JSONObject;
/**
* Test application

2
de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/BaseImporter.java

@ -189,7 +189,7 @@ public abstract class BaseImporter implements Importer { @@ -189,7 +189,7 @@ public abstract class BaseImporter implements Importer {
var href = anchor.get(HREF);
if (href == null) return null;
if (!href.contains("://")) href = baseUrl()+href;
var txt = anchor.strip();
var txt = anchor.strip().trim();
return url(Payload.of(href)).optional().map(url -> new Link(url,txt)).orElse(null);
})
.filter(Objects::nonNull)

14
de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/Util.java

@ -41,6 +41,7 @@ public class Util { @@ -41,6 +41,7 @@ public class Util {
// String constant "VCALENDAR" (presumably the iCalendar component name — confirm against usages).
public static final String VCALENDAR = "VCALENDAR";
// Numeric date "d.m.yyyy" (one- or two-digit day and month, four-digit year), optionally preceded by
// non-digit characters from the start of the input and followed by exactly one non-digit character.
// Groups: 1 = day, 2 = month, 3 = year.
// NOTE(review): because of the trailing \D, a date that is the very last thing in the input does not match — confirm this is intended.
public static final Pattern GERMAN_DATE_PATTERN = Pattern.compile("^\\D*(\\d\\d?)\\.(\\d\\d?)\\.(\\d{4})\\D");
// Numeric "d.m" without a year, matched anywhere in the input. Groups: 1 = day, 2 = month.
// The pattern is unanchored, so it also matches inside longer digit/dot sequences (e.g. the "1.5" in "11.50").
public static final Pattern GERMAN_DATE_WITHOUT_YEAR = Pattern.compile("(\\d\\d?)\\.(\\d\\d?)");
// Day (optionally followed by a dot), a word, whitespace, a four-digit year and one trailing non-digit.
// Groups: 1 = day, 2 = the word (presumably the month name), 3 = year.
// NOTE(review): \w is ASCII-only unless Pattern.UNICODE_CHARACTER_CLASS is set, so a month name
// containing an umlaut (e.g. "März") will not be matched as a single word — verify against real input.
public static final Pattern GERMAN_DATE_PATTERN_LONG = Pattern.compile("(\\d\\d?)\\.?\\s*(\\w+)\\s+(\\d{4})\\D");
// Time "h:m" with optional ":s", followed by one non-digit character.
// Groups: 1 = hour, 2 = minute, 4 = second (group 3 is the optional ":s" part including the colon).
public static final Pattern GERMAN_TIME_PATTERN = Pattern.compile("(\\d\\d?):(\\d\\d?)(:(\\d\\d?))?\\D");
// CSS "background: url(...)" or "background-image: url(...)" with exactly one whitespace character after
// the colon. Group 2 captures the text between the parentheses.
// NOTE(review): [^)]+ is greedy, so for url('x.png') group 2 is "x.png'" including the closing quote —
// confirm whether callers strip it.
private static final Pattern BG_IMAGE_URL = Pattern.compile("background(-image)?:\\surl\\('?([^)]+)'?\\)");
@ -251,4 +252,17 @@ public class Util { @@ -251,4 +252,17 @@ public class Util {
return error(e, "Failed to create URL of %s", url);
}
}
/**
 * Parses the first "day.month" occurrence (as matched by {@link #GERMAN_DATE_WITHOUT_YEAR}) in the given text.
 *
 * <p>The year is not part of the input: the current year is assumed, and if the resulting date lies
 * before today, the same day and month of the following year are used instead.
 *
 * @param string text expected to contain a date such as "24.12"; may be {@code null}
 * @return a payload with the resolved date, or an error result if the text is {@code null}, contains no
 *         match, or the matched numbers do not form a valid calendar date (e.g. "31.02")
 */
public static Result<LocalDate> parseGermanDateWithoutYear(String string) {
    if (string == null) return error("Failed to parse date from %s", string);
    var matcher = GERMAN_DATE_WITHOUT_YEAR.matcher(string);
    if (!matcher.find()) return error("Failed to parse date from %s", string);
    int day = Integer.parseInt(matcher.group(1));
    int mon = Integer.parseInt(matcher.group(2));
    var now = LocalDate.now();
    try {
        // LocalDate.of rejects impossible combinations (month 13, 31st of February, ...) with an
        // unchecked DateTimeException; report those as a regular parse error instead of crashing.
        var date = LocalDate.of(now.getYear(), mon, day);
        if (date.isBefore(now)) date = date.withYear(now.getYear() + 1);
        return Payload.of(date);
    } catch (java.time.DateTimeException e) {
        return error(e, "Failed to parse date from %s", string);
    }
}
}

1
de.srsoftware.cal.db/src/main/java/de/srsoftware/cal/db/MariaDB.java

@ -12,7 +12,6 @@ import static de.srsoftware.tools.jdbc.Query.*; @@ -12,7 +12,6 @@ import static de.srsoftware.tools.jdbc.Query.*;
import static java.lang.System.Logger.Level.*;
import de.srsoftware.cal.BaseAppointment;
import de.srsoftware.cal.Util;
import de.srsoftware.cal.api.Appointment;
import de.srsoftware.cal.api.Attachment;
import de.srsoftware.cal.api.Link;

19
de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/erfurt/Eburg.java

@ -1,5 +1,13 @@ @@ -1,5 +1,13 @@
/* © SRSoftware 2024 */
package de.srsoftware.cal.importer.erfurt;
import static de.srsoftware.cal.Util.url;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.CLASS;
import static de.srsoftware.tools.Tag.HREF;
import static de.srsoftware.tools.TagFilter.*;
import de.srsoftware.cal.BaseImporter;
import de.srsoftware.cal.Util;
import de.srsoftware.cal.api.Coords;
@ -7,8 +15,6 @@ import de.srsoftware.cal.api.Link; @@ -7,8 +15,6 @@ import de.srsoftware.cal.api.Link;
import de.srsoftware.tools.Payload;
import de.srsoftware.tools.Result;
import de.srsoftware.tools.Tag;
import de.srsoftware.tools.TagFilter;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDate;
import java.time.LocalTime;
@ -17,15 +23,6 @@ import java.util.Objects; @@ -17,15 +23,6 @@ import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import static de.srsoftware.cal.Util.dump;
import static de.srsoftware.cal.Util.url;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.CLASS;
import static de.srsoftware.tools.Tag.HREF;
import static de.srsoftware.tools.TagFilter.*;
import static java.lang.System.Logger.Level.INFO;
public class Eburg extends BaseImporter {
private static final Coords DEFAULT_COORDS = new Coords(50.97840, 11.027004);
private static final Pattern DATE_PATTERN = Pattern.compile("(\\d\\d?)\\.\\s*(\\w+)\\W+(\\d\\d?)[.:](\\d\\d?)");

21
de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/erfurt/FromHell.java

@ -1,7 +1,16 @@ @@ -1,7 +1,16 @@
/* © SRSoftware 2024 */
package de.srsoftware.cal.importer.erfurt;
import static de.srsoftware.cal.Util.parseGermanTime;
import static de.srsoftware.cal.Util.parseLongGermanDate;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Optionals.nullIfEmpty;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.*;
import static de.srsoftware.tools.TagFilter.*;
import static java.nio.charset.StandardCharsets.UTF_8;
import de.srsoftware.cal.BaseImporter;
import de.srsoftware.cal.Util;
import de.srsoftware.cal.api.Attachment;
import de.srsoftware.cal.api.Coords;
import de.srsoftware.cal.api.Link;
@ -9,7 +18,6 @@ import de.srsoftware.tools.Payload; @@ -9,7 +18,6 @@ import de.srsoftware.tools.Payload;
import de.srsoftware.tools.Result;
import de.srsoftware.tools.Strings;
import de.srsoftware.tools.Tag;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@ -22,15 +30,6 @@ import java.util.List; @@ -22,15 +30,6 @@ import java.util.List;
import java.util.Objects;
import java.util.function.Predicate;
import static de.srsoftware.cal.Util.parseGermanTime;
import static de.srsoftware.cal.Util.parseLongGermanDate;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Optionals.nullIfEmpty;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.*;
import static de.srsoftware.tools.TagFilter.*;
import static java.nio.charset.StandardCharsets.UTF_8;
public class FromHell extends BaseImporter {
private static final Coords DEFAULT_COORDS = new Coords(50.97372, 10.9541);

229
de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/erfurt/GewerkschaftshausErfurt.java

@ -0,0 +1,229 @@ @@ -0,0 +1,229 @@
/* © SRSoftware 2024 */
package de.srsoftware.cal.importer.erfurt;
import static de.srsoftware.cal.Util.parseGermanTime;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.*;
import static de.srsoftware.tools.TagFilter.*;
import static java.nio.charset.StandardCharsets.UTF_8;
import de.srsoftware.cal.BaseImporter;
import de.srsoftware.cal.Util;
import de.srsoftware.cal.api.Attachment;
import de.srsoftware.cal.api.Coords;
import de.srsoftware.cal.api.Link;
import de.srsoftware.tools.Payload;
import de.srsoftware.tools.Result;
import de.srsoftware.tools.Tag;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDate;
import java.time.LocalTime;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Importer for the events listed on https://hsd-erfurt.de.
 *
 * <p>The location text extracted by {@link BaseImporter} is mapped onto one of two known venues
 * (Museumskeller or Gewerkschaftshaus/HSD) by keyword matching; address, coordinates and tags are
 * derived from that mapping.
 */
public class GewerkschaftshausErfurt extends BaseImporter {
    private static final String MUSEUMSKELLER = "Museumskeller, Juri-Gagarin-Ring 140a, 99084 Erfurt";
    private static final String HSD_LOC = "Gewerkschaftshaus, Juri-Gagarin-Ring 150, 99084 Erfurt";
    private static final Coords COORDS_MUSEUM = new Coords(50.98196, 11.03554);
    private static final Coords COORDS_HSD = new Coords(50.98271, 11.0349);

    // Strips alt="..." from img tags. The lazy quantifier makes sure that every occurrence on a line is
    // removed, not only the last one; \b keeps the expression from anchoring on other tag names.
    private static final Pattern IMG_ALT = Pattern.compile("(<img\\b.*?) alt=\"[^\"]*\"");
    // Strips onclick="..." following an anchor tag, same reasoning as above.
    private static final Pattern ANCHOR_ONCLICK = Pattern.compile("(<a\\b.*?) onclick=\"[^\"]*\"");

    /** Venues this importer can recognise from a location text. */
    private enum Venue { MUSEUM, HSD, UNKNOWN }

    /**
     * Creates the importer.
     *
     * @throws NoSuchAlgorithmException propagated from the {@link BaseImporter} constructor
     */
    public GewerkschaftshausErfurt() throws NoSuchAlgorithmException {
        super();
    }

    /** @return the site root, also used to complete relative links */
    @Override
    protected String baseUrl() {
        return "https://hsd-erfurt.de";
    }

    /** @return a short (German) description of this importer */
    @Override
    public String description() {
        return "Importer für Events des Gewerkschaftshaus` Erfurt";
    }

    /**
     * Returns the attachments found by the base implementation, without those whose URL contains
     * "/img/social".
     */
    @Override
    protected List<Attachment> extractAttachments(Tag eventTag) {
        return super.extractAttachments(eventTag).stream()
            .filter(att -> !att.url().toString().contains("/img/social"))
            .toList();
    }

    @Override
    protected Predicate<Tag> extractAttachmentsFilter() {
        return attributeHas(ID, "main-content");
    }

    @Override
    protected Predicate<Tag> extractDescriptionFilter() {
        return attributeHas(CLASS, "all-description");
    }

    /**
     * Resolves the coordinates of the venue named in the event's location text.
     *
     * @return the coordinates of the recognised venue, the (transformed) failure of the location
     *         lookup, or an error if the location text matches no known venue
     */
    @Override
    protected Result<Coords> extractCoords(Tag eventTag) {
        // Deliberately uses the raw location of the base class, not the normalised address of this class.
        var res = super.extractLocation(eventTag);
        var opt = res.optional();
        if (opt.isEmpty()) return transform(res);
        var location = opt.get();
        var venue = venueOf(location);
        if (venue == Venue.MUSEUM) return Payload.of(COORDS_MUSEUM);
        if (venue == Venue.HSD) return Payload.of(COORDS_HSD);
        return error("unknown location: %s", location);
    }

    /**
     * Returns {@code null}, i.e. no filter for an end date is supplied.
     * NOTE(review): presumably BaseImporter treats a null filter as "not available" — confirm.
     */
    @Override
    protected Predicate<Tag> extractEndDateFilter() {
        return null;
    }

    /**
     * Returns {@code null}, i.e. no filter for an end time is supplied.
     * NOTE(review): presumably BaseImporter treats a null filter as "not available" — confirm.
     */
    @Override
    protected Predicate<Tag> extractEndTimeFilter() {
        return null;
    }

    @Override
    protected Predicate<Tag> extractEventTagFilter() {
        return attributeHas(ID, "main-content");
    }

    /**
     * Collects the distinct event page URLs from the program page: all anchors below the element with
     * id "events-list" whose (absolute) target contains "/events/".
     *
     * @param programPage the parsed program page, or a failure
     * @return the list of event URLs, or the (transformed) failure passed in
     */
    @Override
    protected Result<List<String>> extractEventUrls(Result<Tag> programPage) {
        var opt = programPage.optional();
        if (opt.isEmpty()) return transform(programPage);
        var list = opt.get().find(attributeEquals(ID, "events-list")).stream()
            .flatMap(tag -> tag.find(IS_ANCHOR).stream())
            .map(a -> a.get(HREF))
            .filter(Objects::nonNull)
            .map(link -> link.contains("://") ? link : baseUrl() + link)
            .filter(link -> link.contains("/events/"))
            .distinct()
            .toList();
        return Payload.of(list);
    }

    /** Returns the links found by the base implementation, without those whose URL contains "#". */
    @Override
    protected List<Link> extractLinks(Tag appointmentTag) {
        return super.extractLinks(appointmentTag).stream()
            .filter(link -> !link.url().toString().contains("#"))
            .toList();
    }

    @Override
    protected Predicate<Tag> extractLinksFilter() {
        return attributeHas(CLASS, "event-details");
    }

    /**
     * Normalises the location text of the event to the full address of the recognised venue.
     *
     * @return the venue address, the original text (after logging a warning) when no venue is
     *         recognised, or the (transformed) failure of the base implementation
     */
    @Override
    protected Result<String> extractLocation(Tag eventTag) {
        var res = super.extractLocation(eventTag);
        var opt = res.optional();
        if (opt.isEmpty()) return transform(res);
        var location = opt.get();
        var venue = venueOf(location);
        if (venue == Venue.MUSEUM) return Payload.of(MUSEUMSKELLER);
        if (venue == Venue.HSD) return Payload.of(HSD_LOC);
        LOG.log(System.Logger.Level.WARNING, "unknown location: {0}", location);
        return Payload.of(location);
    }

    @Override
    protected Predicate<Tag> extractLocationFilter() {
        return attributeHas(CLASS, "event-time-place");
    }

    @Override
    protected Predicate<Tag> extractStartDateFilter() {
        return attributeHas(CLASS, "event-date");
    }

    @Override
    protected Predicate<Tag> extractStartTimeFilter() {
        return attributeHas(CLASS, "event-time-place");
    }

    /**
     * Builds the tag list for an event: always "Erfurt", plus venue tags for every venue whose keywords
     * occur in the location text (both venues may be tagged if keywords of both are present).
     */
    @Override
    protected List<String> extractTags(Tag eventTag) {
        var tags = new HashSet<String>();
        tags.add("Erfurt");
        super.extractLocation(eventTag).optional().ifPresent(location -> {
            var lower = location.toLowerCase(Locale.ROOT);
            if (containsAny(lower, "museum", "keller", "bier", "garten")) tags.add("Museumskeller");
            if (containsAny(lower, "hsd", "haus")) {
                tags.add("HSD");
                tags.add("Gewerkschaftshaus");
            }
        });
        return List.copyOf(tags);
    }

    @Override
    protected Predicate<Tag> extractTitleFilter() {
        return ofType("h2");
    }

    /** Returns {@code null}: end dates are not parsed by this importer. */
    @Override
    protected Result<LocalDate> parseEndDate(String string) {
        return null;
    }

    /** Returns {@code null}: end times are not parsed by this importer. */
    @Override
    protected Result<LocalTime> parseEndTime(String string) {
        return null;
    }

    /** Parses a "day.month" date; the year is inferred by {@link Util#parseGermanDateWithoutYear(String)}. */
    @Override
    protected Result<LocalDate> parseStartDate(String string) {
        return Util.parseGermanDateWithoutYear(string);
    }

    @Override
    protected Result<LocalTime> parseStartTime(String string) {
        return parseGermanTime(string);
    }

    /**
     * The HSD site emits a lot of invalid markup, so the page is buffered and the attributes known to
     * break the parser (img alt texts and onclick handlers, which may contain unmatched single quotes)
     * are removed before parsing.
     *
     * @param inputStream incoming input stream, wrapped in a Result
     * @return input stream over the sanitised page, wrapped in a Result; the (transformed) failure
     *         passed in; or an error if reading the stream fails
     */
    @Override
    protected Result<InputStream> preload(Result<InputStream> inputStream) {
        var opt = inputStream.optional();
        if (opt.isEmpty()) return transform(inputStream);
        // try-with-resources: the source stream is closed even when buffering fails
        try (var input = opt.get()) {
            var bos = new ByteArrayOutputStream();
            input.transferTo(bos);
            var code = bos.toString(UTF_8);
            // mitigate <img alt="image description contains title that contains single quote (') breaks parser" />
            code = IMG_ALT.matcher(code).replaceAll("$1");
            // mitigate <a onclick="anchor code contains title that contains unmatched single quote (') that breaks parser" />
            code = ANCHOR_ONCLICK.matcher(code).replaceAll("$1");
            return Payload.of(new ByteArrayInputStream(code.getBytes(UTF_8)));
        } catch (IOException e) {
            return error(e, "Failed to buffer data from %s", inputStream);
        }
    }

    /** @return the URL of the program page, which is the site root */
    @Override
    protected String programURL() {
        return baseUrl();
    }

    /**
     * Maps a location text onto a venue. The order of the checks matters: "museum"/"keller" win over
     * "hsd"/"haus", which in turn win over "bier"/"garten".
     */
    private static Venue venueOf(String location) {
        // Locale.ROOT keeps the keyword matching independent of the JVM's default locale
        var lower = location.toLowerCase(Locale.ROOT);
        if (containsAny(lower, "museum", "keller")) return Venue.MUSEUM;
        if (containsAny(lower, "hsd", "haus")) return Venue.HSD;
        if (containsAny(lower, "bier", "garten")) return Venue.MUSEUM;
        return Venue.UNKNOWN;
    }

    /** @return true if the text contains at least one of the given keywords */
    private static boolean containsAny(String text, String... keywords) {
        for (var keyword : keywords) {
            if (text.contains(keyword)) return true;
        }
        return false;
    }
}

3
de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/weimar/CKeller.java

@ -2,7 +2,6 @@ @@ -2,7 +2,6 @@
package de.srsoftware.cal.importer.weimar;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Optionals.nullable;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.*;
import static de.srsoftware.tools.TagFilter.*;
@ -25,7 +24,6 @@ import java.time.LocalDateTime; @@ -25,7 +24,6 @@ import java.time.LocalDateTime;
import java.time.LocalTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@ -62,7 +60,6 @@ public class CKeller implements Importer { @@ -62,7 +60,6 @@ public class CKeller implements Importer {
private Result<List<Appointment>> extract(Result<Tag> tagResult) {
var list = new ArrayList<Appointment>();
if (tagResult.optional().isEmpty()) return transform(tagResult);
Util.dump(tagResult.optional().get());
var divs = tagResult.optional().get()
.find(attributeEquals(ID,"col2_content")).stream()
.flatMap(tag -> tag.children().stream())

14
de.srsoftware.cal.importer/src/test/java/RegexTest.java

@ -0,0 +1,14 @@ @@ -0,0 +1,14 @@
/* © SRSoftware 2024 */
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
/**
 * Checks a regular expression that strips the alt attribute from an img tag while leaving the
 * remaining attributes untouched.
 */
public class RegexTest {
    @Test
    public void test() {
        // the alt text contains a single quote, the kind of content the expression is meant to remove
        var input = " <img id=\"nope\" alt=\"that's crap\" class=\"test\" >\n";
        var expected = " <img id=\"nope\" class=\"test\" >\n";
        var altStripper = "(<img.*) alt=[\"][^\"]*[\"](.*>)";

        var actual = input.replaceAll(altStripper, "$1$2");

        assertEquals(expected, actual);
    }
}
Loading…
Cancel
Save