started another importer overhaul

Signed-off-by: Stephan Richter <s.richter@srsoftware.de>
This commit is contained in:
2025-01-01 16:11:30 +01:00
parent d9e5475962
commit 3c7313fc01
9 changed files with 527 additions and 257 deletions

View File

@@ -0,0 +1,143 @@
/* © SRSoftware 2024 */
package de.srsoftware.cal.importer.jena;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.TagFilter.*;
import static java.nio.charset.StandardCharsets.UTF_8;
import de.srsoftware.cal.BaseImporter;
import de.srsoftware.tools.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDate;
import java.util.List;
import java.util.regex.Pattern;
public abstract class CosmicDawn extends BaseImporter {
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)");
public CosmicDawn() throws NoSuchAlgorithmException {
super();
}
@Override
protected String baseUrl() {
return "https://www.kuba-jena.de";
}
@Override
protected Result<Tag> extractDescriptionTag(Tag eventTag) {
var list = eventTag.find(attributeEndsWith("class","event-body-content"));
return list.isEmpty() ? error("failed to find <div class=\"…event-body-content\">") : Payload.of(list.getFirst());
}
protected Result<Tag> extractEndTag(Tag eventTag) {
return null;
}
@Override
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
if (pageResult.optional().isEmpty()) return transform(pageResult);
List<Tag> list = pageResult.optional().get().find(attributeEquals("class", "inside-article"));
return (list.isEmpty()) ? error("Failed to find <div class=\"inside-article\">!") : Payload.of(list.getFirst());
}
@Override
protected Result<List<String>> extractEventUrls(Result<Tag> programPage) {
var page = programPage.optional();
if (page.isEmpty()) return transform(programPage);
try {
Files.writeString(Path.of("/tmp/test.txt"),page.get().toString(2));
} catch (IOException e) {
throw new RuntimeException(e);
}
var list = page.get().find(attributeEquals("class","event_listings_main"));
var urlList = list.stream()
.flatMap(tag -> tag.find(IS_ANCHOR).stream())
.map(tag -> tag.get("href"))
.toList();
return Payload.of(urlList);
}
@Override
protected Result<Tag> extractLocationTag(Tag eventTag) {
return null;
}
protected Result<Tag> extractStartTag(Tag eventTag) {
var dateTags = eventTag.find(attributeContains("class","event-date-time"));
if (dateTags.isEmpty()) return error("Start date not found!");
var times = eventTag.find(attributeEquals("class","event_time")).stream()
.flatMap(tag -> tag.find(IS_SPAN).stream())
.filter(tag -> tag.toString().contains("Begin"))
.toList();
if (times.isEmpty()) return error("Start time not found!");
var div = Tag.of("div").add(dateTags.getFirst()).add(times.getFirst());
return Payload.of(div);
}
@Override
protected List<String> extractTags(Tag eventTag) {
return List.of();
}
@Override
protected Result<Tag> extractTitleTag(Tag eventTag) {
var list = eventTag.find(ofType("h1"));
return list.isEmpty() ? error("failed to find <h1>") : Payload.of(list.getFirst());
}
@Override
protected Result<LocalDate> parseEndDate(String string) {
return null;
}
@Override
protected Result<LocalDate> parseStartDate(String date) {
var matcher = START_DATE_PATTERN.matcher(date);
if (matcher.find()){
int day = Integer.parseInt(matcher.group(1));
int mon = Integer.parseInt(matcher.group(2));
int year= Integer.parseInt(matcher.group(3));
int hour = Integer.parseInt(matcher.group(4));
int min = Integer.parseInt(matcher.group(5));
}
return null;
}
/**
* Die Kuba-Seite haut einen haufen Script mit raus, der dazu führt, dass die Tags nicht richtig geparsed werden.
* Also schneiden wir den kompletten header ab...
* @param inputStream eingehender InputStream, verpackt in Result
* @return ausgehender InputStream, verpackt in Result
*/
@Override
protected Result<InputStream> preload(Result<InputStream> inputStream) {
var opt = inputStream.optional();
if (opt.isEmpty()) return transform(inputStream);
try {
var input = opt.get();
var bos = new ByteArrayOutputStream();
input.transferTo(bos);
input.close();
String code = bos.toString(UTF_8);
var pos = code.indexOf("<body");
return Payload.of(new ByteArrayInputStream(code.substring(pos).getBytes(UTF_8)));
} catch (IOException e) {
return error(e, "Failed to buffer data from %s", inputStream);
}
}
@Override
protected String programURL() {
return baseUrl()+"/veranstaltungen/";
//return "http://httpbin.org/headers";
}
}

View File

@@ -1,24 +1,25 @@
/* © SRSoftware 2024 */
package de.srsoftware.cal.importer.jena;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.cal.Util.*;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.CLASS;
import static de.srsoftware.tools.Tag.DIV;
import static de.srsoftware.tools.TagFilter.*;
import de.srsoftware.cal.BaseImporter;
import de.srsoftware.cal.api.Coords;
import de.srsoftware.tools.*;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDateTime;
import java.time.LocalDate;
import java.time.LocalTime;
import java.util.List;
import java.util.regex.Pattern;
import java.util.function.Predicate;
public class Kassablanca extends BaseImporter {
public static final String BASE_URL = "https://www.kassablanca.de";
private static final String APPOINTMENT_TAG_ID = "entry-content";
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d+).(\\d+).(\\d+).*Beginn\\s*(\\d+):(\\d+)\\s*Uhr");
private static final String LOCATION = "Kassablanca e.V., Felsenkellerstr. 13a, 07745 Jena";
private static final Coords COORDS = new Coords(50.92093, 11.57788);
private static final String LOCATION = "Kassablanca e.V., Felsenkellerstr. 13a, 07745 Jena";
public Kassablanca() throws NoSuchAlgorithmException {
super();
@@ -29,70 +30,87 @@ public class Kassablanca extends BaseImporter {
return BASE_URL;
}
/**
 * Provides the human-readable description of this importer.
 *
 * @return the (German) description text
 */
@Override
public String description() {
    final String text = "Importiert Events des Studentenclubs „Kassablanca“ in Jena";
    return text;
}
/**
 * Supplies the filter for the attachments hook.
 *
 * @return a filter built with {@code attributeEquals(CLASS, "entry-content")}
 */
@Override
protected Predicate<Tag> extractAttachmentsFilter() {
    final Predicate<Tag> entryContent = attributeEquals(CLASS, "entry-content");
    return entryContent;
}
/**
 * Supplies the filter for the description hook.
 *
 * @return a filter built with {@code attributeEquals(CLASS, "se-content")}
 */
@Override
protected Predicate<Tag> extractDescriptionFilter() {
    final Predicate<Tag> content = attributeEquals(CLASS, "se-content");
    return content;
}
/**
 * Returns the fixed coordinates of the venue; the event tag is not inspected.
 *
 * @param eventTag the tag of a single event (unused)
 * @return a payload wrapping the constant {@code COORDS}
 */
@Override
protected Result<Coords> extractCoords(Tag eventTag) {
    final Result<Coords> venue = Payload.of(COORDS);
    return venue;
}
@Override
protected Result<Tag> extractDescriptionTag(Tag eventTag) {
var list = eventTag.find(attributeHas("class", "se-content"));
if (list.size() == 1) return Payload.of(list.getFirst());
return error("Failed to find description tag");
protected Predicate<Tag> extractEndDateFilter() {
return null;
}
@Override
protected Result<Tag> extractEndTag(Tag eventTag) {
return error("end date not supported");
protected Predicate<Tag> extractEndTimeFilter() {
return null;
}
@Override
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
if (pageResult.optional().isEmpty()) return transform(pageResult);
var list = pageResult.optional().get().find(attributeEquals("class", APPOINTMENT_TAG_ID));
if (list.size() == 1) return Payload.of(list.getFirst());
return error("Could not find tag with id \"%s\"", APPOINTMENT_TAG_ID);
protected Predicate<Tag> extractEventTagFilter() {
return attributeEquals(CLASS,"entry-content");
}
@Override
protected Result<List<String>> extractEventUrls(Result<Tag> programPage) {
if (programPage.optional().isEmpty()) return transform(programPage);
List<String> list = programPage.optional()
.get() //
.find(attributeHas("class", "eventrow"))
.stream()
.flatMap(t -> t.find(ofType("h3")).stream())
.map(t -> t.find(ofType("a")))
.flatMap(List::stream)
.map(t -> t.get("href"))
.toList();
.get() //
.find(attributeHas("class", "eventrow"))
.stream()
.flatMap(t -> t.find(ofType("h3")).stream())
.map(t -> t.find(IS_ANCHOR))
.flatMap(List::stream)
.map(t -> t.get("href"))
.toList();
return Payload.of(list);
}
@Override
public Result<List<Tag>> extractLinkAnchors(Result<Tag> tagResult) {
if (tagResult.optional().isEmpty()) return transform(tagResult);
var tag = tagResult.optional().get();
tag.find(attributeEquals("id", "filterbar")).stream().findAny().ifPresent(Tag::remove); // remove div with unrelated links
var anchors = tag.find(withAttribute("href"));
return Payload.of(anchors);
protected Predicate<Tag> extractLinksFilter() {
return attributeEquals(CLASS,"se-container");
}
@Override
protected Result<Tag> extractLinksTag(Tag eventTag) {
return Payload.of(eventTag);
var top = eventTag.find(attributeEquals(CLASS,"se-container"));
var bottom = eventTag.find(attributeEquals(CLASS, "se-content"));
var common = Tag.of(DIV).addAll(top).addAll(bottom);
return Payload.of(common);
}
@Override
protected Result<Tag> extractLocationTag(Tag eventTag) {
return Payload.of(new Text(LOCATION));
protected Result<String> extractLocation(Tag eventTag) {
return Payload.of(LOCATION);
}
@Override
protected Result<Tag> extractStartTag(Tag eventTag) {
List<Tag> tags = eventTag.find(attributeEquals("class", "se-header"));
if (tags.size() == 1) return Payload.of(tags.getFirst());
return error("Failed to find event time information");
protected Predicate<Tag> extractLocationFilter() {
return null;
}
/**
 * Supplies the filter for the start-date hook.
 *
 * @return a filter built with {@code attributeEquals(CLASS, "se-header")}
 */
@Override
protected Predicate<Tag> extractStartDateFilter() {
    final Predicate<Tag> header = attributeEquals(CLASS, "se-header");
    return header;
}
/**
 * Supplies the filter for the start-time hook; it is built with the same arguments as the start-date filter.
 *
 * @return a filter built with {@code attributeEquals(CLASS, "se-header")}
 */
@Override
protected Predicate<Tag> extractStartTimeFilter() {
    final Predicate<Tag> header = attributeEquals(CLASS, "se-header");
    return header;
}
@Override
@@ -101,30 +119,28 @@ public class Kassablanca extends BaseImporter {
}
@Override
protected Result<Tag> extractTitleTag(Tag eventTag) {
var list = eventTag.find(ofType("h1"));
if (list.size() == 1) return Payload.of(list.getFirst());
return error("Failed to find title tag");
protected Predicate<Tag> extractTitleFilter() {
return ofType("h1");
}
@Override
protected Result<LocalDateTime> parseEndDate(String string) {
protected Result<LocalDate> parseEndDate(String string) {
return null;
}
@Override
protected Result<LocalDateTime> parseStartDate(String string) {
var matcher = START_DATE_PATTERN.matcher(string);
if (matcher.find()) {
var day = Integer.parseInt(matcher.group(1));
var month = Integer.parseInt(matcher.group(2));
var year = Integer.parseInt(matcher.group(3));
var hour = Integer.parseInt(matcher.group(4));
var minute = Integer.parseInt(matcher.group(5));
var date = LocalDateTime.of(year, month, day, hour, minute);
return Payload.of(date);
}
return error("Could not recognize start date/time");
protected Result<LocalTime> parseEndTime(String string) {
return null;
}
/**
 * Parses the start date by delegating to {@code parseGermanDate}
 * (presumably supplied by the static import of {@code de.srsoftware.cal.Util} – verify).
 *
 * @param string the text to parse
 * @return whatever {@code parseGermanDate} yields for the given text
 */
@Override
protected Result<LocalDate> parseStartDate(String string) {
    final Result<LocalDate> parsed = parseGermanDate(string);
    return parsed;
}
/**
 * Parses the start time by delegating to {@code parseGermanTime}
 * (presumably supplied by the static import of {@code de.srsoftware.cal.Util} – verify).
 *
 * @param string the text to parse
 * @return whatever {@code parseGermanTime} yields for the given text
 */
@Override
protected Result<LocalTime> parseStartTime(String string) {
    final Result<LocalTime> parsed = parseGermanTime(string);
    return parsed;
}
@Override

View File

@@ -12,11 +12,12 @@ import de.srsoftware.tools.Payload;
import de.srsoftware.tools.Result;
import de.srsoftware.tools.Tag;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
import java.util.regex.Pattern;
public class Rosenkeller extends BaseImporter {
public abstract class Rosenkeller extends BaseImporter {
private static final String APPOINTMENT_TAG_ID = "tribe-events-content";
private static final String BASE_URL = "https://rosenkeller.org";
private static final Pattern DATE_PATTERN = Pattern.compile("(\\d+) (\\w+)(\\W+(\\d+):(\\d+))?");
@@ -51,7 +52,7 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
return error("Failed to find description tag");
}
/**
 * End tags are not supported by this importer.
 *
 * @param eventTag the tag of a single event (unused)
 * @return always an error result
 */
@Override
protected Result<Tag> extractEndTag(Tag eventTag) {
    final Result<Tag> unsupported = error("extractEndTag(…) not supported");
    return unsupported;
}
@@ -78,19 +79,12 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
return Payload.of(list);
}
/**
 * Collects all {@code a} tags found below tags whose {@code id} starts with {@code post-}.
 *
 * @param tagResult the result carrying the tag to search in
 * @return the anchors found, or the transformed input result if it carries no tag
 */
@Override
public Result<List<Tag>> extractLinkAnchors(Result<Tag> tagResult) {
    var root = tagResult.optional();
    if (root.isEmpty()) {
        return transform(tagResult);
    }
    var posts = root.get().find(attributeStartsWith("id", "post-"));
    List<Tag> anchors = posts.stream()
        .flatMap(post -> post.find(ofType("a")).stream())
        .toList();
    return Payload.of(anchors);
}
/**
 * Returns a synthetic {@code span} whose content is {@code DEFAULT_LOCATION}; the event tag is not inspected.
 *
 * @param eventTag the tag of a single event (unused)
 * @return a payload wrapping the synthetic location tag
 */
@Override
protected Result<Tag> extractLocationTag(Tag eventTag) {
    var span = new Tag("span").content(DEFAULT_LOCATION);
    return Payload.of(span);
}
@Override
protected Result<Tag> extractStartTag(Tag eventTag) {
List<Tag> list = eventTag.find(attributeEquals("class", "tribe-event-date-start"));
if (list.size() == 1) return Payload.of(list.getFirst());
@@ -110,12 +104,12 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
}
@Override
protected Result<LocalDateTime> parseEndDate(String text) {
protected Result<LocalDate> parseEndDate(String text) {
return error("parseEndDate(…) not supported");
}
@Override
protected Result<LocalDateTime> parseStartDate(String text) {
protected Result<LocalDate> parseStartDate(String text) {
var match = DATE_PATTERN.matcher(text);
if (match.find()) {
var dayOfMonth = Integer.parseInt(match.group(1));
@@ -127,7 +121,7 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
var now = LocalDateTime.now();
var date = LocalDateTime.of(now.getYear(), month.optional().get(), dayOfMonth, hour, minute);
if (date.isBefore(now)) date = date.plusYears(1);
return Payload.of(date);
//return Payload.of(date);
}
return error("Failed to recognize a date in \"%s\"", text);
}