started another importer overhaul
Signed-off-by: Stephan Richter <s.richter@srsoftware.de>
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
/* © SRSoftware 2024 */
|
||||
package de.srsoftware.cal.importer.jena;
|
||||
|
||||
import static de.srsoftware.tools.Error.error;
|
||||
import static de.srsoftware.tools.Result.transform;
|
||||
import static de.srsoftware.tools.TagFilter.*;
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
|
||||
import de.srsoftware.cal.BaseImporter;
|
||||
import de.srsoftware.tools.*;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.LocalDate;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public abstract class CosmicDawn extends BaseImporter {
|
||||
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)");
|
||||
|
||||
public CosmicDawn() throws NoSuchAlgorithmException {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String baseUrl() {
|
||||
return "https://www.kuba-jena.de";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractDescriptionTag(Tag eventTag) {
|
||||
var list = eventTag.find(attributeEndsWith("class","event-body-content"));
|
||||
return list.isEmpty() ? error("failed to find <div class=\"…event-body-content\">") : Payload.of(list.getFirst());
|
||||
}
|
||||
|
||||
protected Result<Tag> extractEndTag(Tag eventTag) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
|
||||
if (pageResult.optional().isEmpty()) return transform(pageResult);
|
||||
List<Tag> list = pageResult.optional().get().find(attributeEquals("class", "inside-article"));
|
||||
return (list.isEmpty()) ? error("Failed to find <div class=\"inside-article\">!") : Payload.of(list.getFirst());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<List<String>> extractEventUrls(Result<Tag> programPage) {
|
||||
var page = programPage.optional();
|
||||
if (page.isEmpty()) return transform(programPage);
|
||||
try {
|
||||
Files.writeString(Path.of("/tmp/test.txt"),page.get().toString(2));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
var list = page.get().find(attributeEquals("class","event_listings_main"));
|
||||
var urlList = list.stream()
|
||||
.flatMap(tag -> tag.find(IS_ANCHOR).stream())
|
||||
.map(tag -> tag.get("href"))
|
||||
.toList();
|
||||
return Payload.of(urlList);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractLocationTag(Tag eventTag) {
|
||||
return null;
|
||||
}
|
||||
|
||||
protected Result<Tag> extractStartTag(Tag eventTag) {
|
||||
var dateTags = eventTag.find(attributeContains("class","event-date-time"));
|
||||
if (dateTags.isEmpty()) return error("Start date not found!");
|
||||
var times = eventTag.find(attributeEquals("class","event_time")).stream()
|
||||
.flatMap(tag -> tag.find(IS_SPAN).stream())
|
||||
.filter(tag -> tag.toString().contains("Begin"))
|
||||
.toList();
|
||||
if (times.isEmpty()) return error("Start time not found!");
|
||||
var div = Tag.of("div").add(dateTags.getFirst()).add(times.getFirst());
|
||||
return Payload.of(div);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> extractTags(Tag eventTag) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractTitleTag(Tag eventTag) {
|
||||
var list = eventTag.find(ofType("h1"));
|
||||
return list.isEmpty() ? error("failed to find <h1>") : Payload.of(list.getFirst());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDate> parseEndDate(String string) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDate> parseStartDate(String date) {
|
||||
var matcher = START_DATE_PATTERN.matcher(date);
|
||||
if (matcher.find()){
|
||||
int day = Integer.parseInt(matcher.group(1));
|
||||
int mon = Integer.parseInt(matcher.group(2));
|
||||
int year= Integer.parseInt(matcher.group(3));
|
||||
int hour = Integer.parseInt(matcher.group(4));
|
||||
int min = Integer.parseInt(matcher.group(5));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Die Kuba-Seite haut einen haufen Script mit raus, der dazu führt, dass die Tags nicht richtig geparsed werden.
|
||||
* Also schneiden wir den kompletten header ab...
|
||||
* @param inputStream eingehender InputStream, verpackt in Result
|
||||
* @return ausgehender InputStream, verpackt in Result
|
||||
*/
|
||||
@Override
|
||||
protected Result<InputStream> preload(Result<InputStream> inputStream) {
|
||||
var opt = inputStream.optional();
|
||||
if (opt.isEmpty()) return transform(inputStream);
|
||||
try {
|
||||
var input = opt.get();
|
||||
var bos = new ByteArrayOutputStream();
|
||||
input.transferTo(bos);
|
||||
input.close();
|
||||
String code = bos.toString(UTF_8);
|
||||
var pos = code.indexOf("<body");
|
||||
return Payload.of(new ByteArrayInputStream(code.substring(pos).getBytes(UTF_8)));
|
||||
} catch (IOException e) {
|
||||
return error(e, "Failed to buffer data from %s", inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String programURL() {
|
||||
return baseUrl()+"/veranstaltungen/";
|
||||
//return "http://httpbin.org/headers";
|
||||
}
|
||||
}
|
||||
@@ -1,24 +1,25 @@
|
||||
/* © SRSoftware 2024 */
|
||||
package de.srsoftware.cal.importer.jena;
|
||||
|
||||
import static de.srsoftware.tools.Error.error;
|
||||
import static de.srsoftware.cal.Util.*;
|
||||
import static de.srsoftware.tools.Result.transform;
|
||||
import static de.srsoftware.tools.Tag.CLASS;
|
||||
import static de.srsoftware.tools.Tag.DIV;
|
||||
import static de.srsoftware.tools.TagFilter.*;
|
||||
|
||||
import de.srsoftware.cal.BaseImporter;
|
||||
import de.srsoftware.cal.api.Coords;
|
||||
import de.srsoftware.tools.*;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalTime;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public class Kassablanca extends BaseImporter {
|
||||
public static final String BASE_URL = "https://www.kassablanca.de";
|
||||
private static final String APPOINTMENT_TAG_ID = "entry-content";
|
||||
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d+).(\\d+).(\\d+).*Beginn\\s*(\\d+):(\\d+)\\s*Uhr");
|
||||
private static final String LOCATION = "Kassablanca e.V., Felsenkellerstr. 13a, 07745 Jena";
|
||||
private static final Coords COORDS = new Coords(50.92093, 11.57788);
|
||||
private static final String LOCATION = "Kassablanca e.V., Felsenkellerstr. 13a, 07745 Jena";
|
||||
|
||||
public Kassablanca() throws NoSuchAlgorithmException {
|
||||
super();
|
||||
@@ -29,70 +30,87 @@ public class Kassablanca extends BaseImporter {
|
||||
return BASE_URL;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String description() {
|
||||
return "Importiert Events des Studentenclubs „Kassablanca“ in Jena";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Predicate<Tag> extractAttachmentsFilter() {
|
||||
return attributeEquals(CLASS,"entry-content");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Predicate<Tag> extractDescriptionFilter() {
|
||||
return attributeEquals(CLASS,"se-content");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Coords> extractCoords(Tag eventTag) {
|
||||
return Payload.of(COORDS);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractDescriptionTag(Tag eventTag) {
|
||||
var list = eventTag.find(attributeHas("class", "se-content"));
|
||||
if (list.size() == 1) return Payload.of(list.getFirst());
|
||||
return error("Failed to find description tag");
|
||||
protected Predicate<Tag> extractEndDateFilter() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractEndTag(Tag eventTag) {
|
||||
return error("end date not supported");
|
||||
protected Predicate<Tag> extractEndTimeFilter() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
|
||||
if (pageResult.optional().isEmpty()) return transform(pageResult);
|
||||
var list = pageResult.optional().get().find(attributeEquals("class", APPOINTMENT_TAG_ID));
|
||||
if (list.size() == 1) return Payload.of(list.getFirst());
|
||||
return error("Could not find tag with id \"%s\"", APPOINTMENT_TAG_ID);
|
||||
protected Predicate<Tag> extractEventTagFilter() {
|
||||
return attributeEquals(CLASS,"entry-content");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<List<String>> extractEventUrls(Result<Tag> programPage) {
|
||||
if (programPage.optional().isEmpty()) return transform(programPage);
|
||||
List<String> list = programPage.optional()
|
||||
.get() //
|
||||
.find(attributeHas("class", "eventrow"))
|
||||
.stream()
|
||||
.flatMap(t -> t.find(ofType("h3")).stream())
|
||||
.map(t -> t.find(ofType("a")))
|
||||
.flatMap(List::stream)
|
||||
.map(t -> t.get("href"))
|
||||
.toList();
|
||||
.get() //
|
||||
.find(attributeHas("class", "eventrow"))
|
||||
.stream()
|
||||
.flatMap(t -> t.find(ofType("h3")).stream())
|
||||
.map(t -> t.find(IS_ANCHOR))
|
||||
.flatMap(List::stream)
|
||||
.map(t -> t.get("href"))
|
||||
.toList();
|
||||
return Payload.of(list);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Result<List<Tag>> extractLinkAnchors(Result<Tag> tagResult) {
|
||||
if (tagResult.optional().isEmpty()) return transform(tagResult);
|
||||
var tag = tagResult.optional().get();
|
||||
tag.find(attributeEquals("id", "filterbar")).stream().findAny().ifPresent(Tag::remove); // remove div with unrelated links
|
||||
var anchors = tag.find(withAttribute("href"));
|
||||
return Payload.of(anchors);
|
||||
protected Predicate<Tag> extractLinksFilter() {
|
||||
return attributeEquals(CLASS,"se-container");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractLinksTag(Tag eventTag) {
|
||||
return Payload.of(eventTag);
|
||||
var top = eventTag.find(attributeEquals(CLASS,"se-container"));
|
||||
var bottom = eventTag.find(attributeEquals(CLASS, "se-content"));
|
||||
var common = Tag.of(DIV).addAll(top).addAll(bottom);
|
||||
return Payload.of(common);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractLocationTag(Tag eventTag) {
|
||||
return Payload.of(new Text(LOCATION));
|
||||
protected Result<String> extractLocation(Tag eventTag) {
|
||||
return Payload.of(LOCATION);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractStartTag(Tag eventTag) {
|
||||
List<Tag> tags = eventTag.find(attributeEquals("class", "se-header"));
|
||||
if (tags.size() == 1) return Payload.of(tags.getFirst());
|
||||
return error("Failed to find event time information");
|
||||
protected Predicate<Tag> extractLocationFilter() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Predicate<Tag> extractStartDateFilter() {
|
||||
return attributeEquals(CLASS,"se-header");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Predicate<Tag> extractStartTimeFilter() {
|
||||
return attributeEquals(CLASS,"se-header");
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -101,30 +119,28 @@ public class Kassablanca extends BaseImporter {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractTitleTag(Tag eventTag) {
|
||||
var list = eventTag.find(ofType("h1"));
|
||||
if (list.size() == 1) return Payload.of(list.getFirst());
|
||||
return error("Failed to find title tag");
|
||||
protected Predicate<Tag> extractTitleFilter() {
|
||||
return ofType("h1");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDateTime> parseEndDate(String string) {
|
||||
protected Result<LocalDate> parseEndDate(String string) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDateTime> parseStartDate(String string) {
|
||||
var matcher = START_DATE_PATTERN.matcher(string);
|
||||
if (matcher.find()) {
|
||||
var day = Integer.parseInt(matcher.group(1));
|
||||
var month = Integer.parseInt(matcher.group(2));
|
||||
var year = Integer.parseInt(matcher.group(3));
|
||||
var hour = Integer.parseInt(matcher.group(4));
|
||||
var minute = Integer.parseInt(matcher.group(5));
|
||||
var date = LocalDateTime.of(year, month, day, hour, minute);
|
||||
return Payload.of(date);
|
||||
}
|
||||
return error("Could not recognize start date/time");
|
||||
protected Result<LocalTime> parseEndTime(String string) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDate> parseStartDate(String string) {
|
||||
return parseGermanDate(string);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalTime> parseStartTime(String string) {
|
||||
return parseGermanTime(string);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -12,11 +12,12 @@ import de.srsoftware.tools.Payload;
|
||||
import de.srsoftware.tools.Result;
|
||||
import de.srsoftware.tools.Tag;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class Rosenkeller extends BaseImporter {
|
||||
public abstract class Rosenkeller extends BaseImporter {
|
||||
private static final String APPOINTMENT_TAG_ID = "tribe-events-content";
|
||||
private static final String BASE_URL = "https://rosenkeller.org";
|
||||
private static final Pattern DATE_PATTERN = Pattern.compile("(\\d+) (\\w+)(\\W+(\\d+):(\\d+))?");
|
||||
@@ -51,7 +52,7 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
|
||||
return error("Failed to find description tag");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
protected Result<Tag> extractEndTag(Tag eventTag) {
|
||||
return error("extractEndTag(…) not supported");
|
||||
}
|
||||
@@ -78,19 +79,12 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
|
||||
return Payload.of(list);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Result<List<Tag>> extractLinkAnchors(Result<Tag> tagResult) {
|
||||
if (tagResult.optional().isEmpty()) return transform(tagResult);
|
||||
List<Tag> list = tagResult.optional().get().find(attributeStartsWith("id", "post-")).stream().flatMap(tag -> tag.find(ofType("a")).stream()).toList();
|
||||
return Payload.of(list);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractLocationTag(Tag eventTag) {
|
||||
return Payload.of(new Tag("span").content(DEFAULT_LOCATION));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<Tag> extractStartTag(Tag eventTag) {
|
||||
List<Tag> list = eventTag.find(attributeEquals("class", "tribe-event-date-start"));
|
||||
if (list.size() == 1) return Payload.of(list.getFirst());
|
||||
@@ -110,12 +104,12 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDateTime> parseEndDate(String text) {
|
||||
protected Result<LocalDate> parseEndDate(String text) {
|
||||
return error("parseEndDate(…) not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Result<LocalDateTime> parseStartDate(String text) {
|
||||
protected Result<LocalDate> parseStartDate(String text) {
|
||||
var match = DATE_PATTERN.matcher(text);
|
||||
if (match.find()) {
|
||||
var dayOfMonth = Integer.parseInt(match.group(1));
|
||||
@@ -127,7 +121,7 @@ private static final Coords COORDS = new Coords(50.92945, 11.58491);
|
||||
var now = LocalDateTime.now();
|
||||
var date = LocalDateTime.of(now.getYear(), month.optional().get(), dayOfMonth, hour, minute);
|
||||
if (date.isBefore(now)) date = date.plusYears(1);
|
||||
return Payload.of(date);
|
||||
//return Payload.of(date);
|
||||
}
|
||||
return error("Failed to recognize a date in \"%s\"", text);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user