From 3158db3bbd8d9693537762d9c18b7cfc0ed13750 Mon Sep 17 00:00:00 2001 From: Stephan Richter Date: Sat, 21 Dec 2024 21:26:04 +0100 Subject: [PATCH] added hash to appointment, implemented KassaBlanca parser Signed-off-by: Stephan Richter --- .../de/srsoftware/cal/api/Appointment.java | 6 + .../de/srsoftware/cal/app/Application.java | 11 +- de.srsoftware.cal.importer/build.gradle.kts | 4 +- .../cal/importer/BaseAppointment.java | 9 +- .../srsoftware/cal/importer/BaseImporter.java | 43 ++++-- .../cal/importer/jena/Kassablanca.java | 126 ++++++++++++++++++ .../cal/importer/jena/Rosenkeller.java | 5 + 7 files changed, 187 insertions(+), 17 deletions(-) create mode 100644 de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Kassablanca.java diff --git a/de.srsoftware.cal.api/src/main/java/de/srsoftware/cal/api/Appointment.java b/de.srsoftware.cal.api/src/main/java/de/srsoftware/cal/api/Appointment.java index 00add6c..ba93236 100644 --- a/de.srsoftware.cal.api/src/main/java/de/srsoftware/cal/api/Appointment.java +++ b/de.srsoftware.cal.api/src/main/java/de/srsoftware/cal/api/Appointment.java @@ -34,6 +34,12 @@ public interface Appointment { */ Optional end(); + /** + * create a unique identifier based on the event content + * @return + */ + String hash(); + /** * ID of the appointment – unique within this system * @return the appointment`s id diff --git a/de.srsoftware.cal.app/src/main/java/de/srsoftware/cal/app/Application.java b/de.srsoftware.cal.app/src/main/java/de/srsoftware/cal/app/Application.java index 90dc558..c8d1fe1 100644 --- a/de.srsoftware.cal.app/src/main/java/de/srsoftware/cal/app/Application.java +++ b/de.srsoftware.cal.app/src/main/java/de/srsoftware/cal/app/Application.java @@ -1,7 +1,8 @@ /* © SRSoftware 2024 */ package de.srsoftware.cal.app; -import de.srsoftware.cal.importer.jena.Rosenkeller; +import de.srsoftware.cal.importer.jena.Kassablanca; +import java.security.NoSuchAlgorithmException; /** * Test application @@ -14,9 +15,9 @@ public class Application { * sandbox * @param args default */ - public static void main(String[] args) { - var rosenkeller = new Rosenkeller(); - var appointments = rosenkeller.fetch(); - appointments.forEach(System.err::println); + public static void main(String[] args) throws NoSuchAlgorithmException { + var importer = new Kassablanca(); + var appointments = importer.fetch(); + appointments.forEach(System.out::println); } } diff --git a/de.srsoftware.cal.importer/build.gradle.kts b/de.srsoftware.cal.importer/build.gradle.kts index 2137ad6..e8b50a8 100644 --- a/de.srsoftware.cal.importer/build.gradle.kts +++ b/de.srsoftware.cal.importer/build.gradle.kts @@ -3,6 +3,6 @@ description = "OpenCloudCal : Importers" dependencies { implementation(project(":de.srsoftware.cal.api")) implementation("de.srsoftware:tools.optionals:1.0.0") - implementation("de.srsoftware:tools.util:1.1.2") - implementation("de.srsoftware:tools.web:1.3.2") + implementation("de.srsoftware:tools.util:1.1.3") + implementation("de.srsoftware:tools.web:1.3.3") } diff --git a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseAppointment.java b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseAppointment.java index 2555df4..37a4ed7 100644 --- a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseAppointment.java +++ b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseAppointment.java @@ -17,6 +17,7 @@ public class BaseAppointment implements Appointment { private final long id; private final String title, description; private final LocalDateTime end, start; + private final String hash; private Coords coords = null; private final Set attachments = new HashSet<>(); private final Set tags = new HashSet<>(); @@ -32,9 +33,10 @@ public class BaseAppointment implements Appointment { * @param end set the end date * @param location set the location */ - public BaseAppointment(long id, String title, String description, LocalDateTime start, LocalDateTime end, String location) { + public BaseAppointment(long id, String title, String description, LocalDateTime start, LocalDateTime end, String location, String hash) { this.description = description; this.end = end; + this.hash = hash; this.id = id; this.location = location; this.start = start; @@ -131,6 +133,11 @@ public class BaseAppointment implements Appointment { return nullable(end); } + @Override + public String hash() { + return hash; + } + @Override public long id() { return id; diff --git a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseImporter.java b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseImporter.java index 9f93d46..5100811 100644 --- a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseImporter.java +++ b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/BaseImporter.java @@ -1,7 +1,9 @@ /* © SRSoftware 2024 */ package de.srsoftware.cal.importer; +import static de.srsoftware.tools.Strings.hex; import static de.srsoftware.tools.TagFilter.ofType; +import static java.nio.charset.StandardCharsets.UTF_8; import de.srsoftware.cal.api.*; import de.srsoftware.tools.*; @@ -12,6 +14,8 @@ import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.List; @@ -20,6 +24,13 @@ import java.util.Optional; import java.util.stream.Stream; public abstract class BaseImporter implements Importer { + private static final String SHA256 = "SHA-256"; + private final MessageDigest digest; + + protected BaseImporter() throws NoSuchAlgorithmException { + digest = MessageDigest.getInstance(SHA256); + } + protected abstract String baseUrl(); @Override @@ -47,7 +58,7 @@ public abstract class BaseImporter implements Importer { } - protected Result extractDescription(Tag eventTag){ + protected Result extractDescription(Tag eventTag) { Result titleTag = extractDescriptionTag(eventTag); if (titleTag.optional().isEmpty()) return transform(titleTag); var inner = titleTag.optional().flatMap(tag -> tag.inner(2)); @@ -91,7 +102,9 @@ public abstract class BaseImporter implements Importer { if (locationResult.optional().isEmpty()) return transform(locationResult); var location = locationResult.optional().get(); - var event = new BaseAppointment(id, title, description, start, end, location) // + var hash = hash("%s@%s".formatted(start, location)); + + var event = new BaseAppointment(id, title, description, start, end, location, hash) // .add(extractAttachments(eventTag)) .addLinks(extractLinks(eventTag)) .tags(extractTags(eventTag)); @@ -139,7 +152,7 @@ public abstract class BaseImporter implements Importer { public abstract Result> extractLinkAnchors(Result tagResult); - private Result extractLinksTag(Tag eventTag) { + protected Result extractLinksTag(Tag eventTag) { return extractDescriptionTag(eventTag); } @@ -153,9 +166,9 @@ public abstract class BaseImporter implements Importer { protected Result extractStart(Tag eventTag) { - Result endTag = extractStartTag(eventTag); - if (endTag.optional().isEmpty()) return transform(endTag); - return parseStartDate(endTag.optional().get().toString(0)); + Result startTag = extractStartTag(eventTag); + if (startTag.optional().isEmpty()) return transform(startTag); + return parseStartDate(startTag.optional().get().strip()); } protected abstract Result extractStartTag(Tag eventTag); @@ -164,9 +177,9 @@ public abstract class BaseImporter implements Importer { protected abstract List extractTags(Tag eventTag); protected Result extractTitle(Tag eventTag) { - Result locationTag = extractTitleTag(eventTag); - if (locationTag.optional().isEmpty()) return transform(locationTag); - var inner = locationTag.optional().flatMap(tag -> tag.inner(2)); + Result titleTag = extractTitleTag(eventTag); + if (titleTag.optional().isEmpty()) return transform(titleTag); + var inner = titleTag.optional().flatMap(tag -> tag.inner(2)); if (inner.isPresent()) return Payload.of(inner.get()); return Error.of("No title found"); } @@ -186,9 +199,21 @@ public abstract class BaseImporter implements Importer { return stream // .map(this::url) .map(this::loadEvent) + .peek(e -> { + if (e instanceof Error err) System.err.println(err); + }) .flatMap(result -> result.optional().stream()); } + /** + * create a hash from a text + * @param plain the plain text + * @return the hash of the plain text + */ + protected String hash(String plain){ + return hex(digest.digest(plain.getBytes(UTF_8))); + } + protected static Result invalidParameter(Result result) { return Error.format("Invalid parameter: %s", result.getClass().getSimpleName()); } diff --git a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Kassablanca.java b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Kassablanca.java new file mode 100644 index 0000000..dafcc43 --- /dev/null +++ b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Kassablanca.java @@ -0,0 +1,126 @@ +/* © SRSoftware 2024 */ +package de.srsoftware.cal.importer.jena; + +import static de.srsoftware.tools.TagFilter.*; + +import de.srsoftware.cal.importer.BaseImporter; +import de.srsoftware.tools.*; +import de.srsoftware.tools.Error; +import java.security.NoSuchAlgorithmException; +import java.time.LocalDateTime; +import java.util.List; +import java.util.regex.Pattern; + +public class Kassablanca extends BaseImporter { + public static final String BASE_URL = "https://www.kassablanca.de"; + private static final String APPOINTMENT_TAG_ID = "entry-content"; + private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d+).(\\d+).(\\d+).*Beginn\\s*(\\d+):(\\d+)\\s*Uhr"); + private static final String LOCATION = "Kassablanca e.V., Felsenkellerstr. 13a, 07745 Jena"; + + public Kassablanca() throws NoSuchAlgorithmException { + super(); + } + + @Override + protected String baseUrl() { + return BASE_URL; + } + + @Override + protected Result extractDescriptionTag(Tag eventTag) { + var list = eventTag.find(attributeHas("class", "se-content")); + if (list.size() == 1) return Payload.of(list.getFirst()); + return Error.of("Failed to find description tag"); + } + + @Override + protected Result extractEndTag(Tag eventTag) { + return Error.format("end date not supported"); + } + + @Override + protected Result extractEventTag(Result pageResult) { + if (pageResult.optional().isEmpty()) return transform(pageResult); + var list = pageResult.optional().get().find(attributeEquals("class", APPOINTMENT_TAG_ID)); + if (list.size() == 1) return Payload.of(list.getFirst()); + return Error.format("Could not find tag with id \"%s\"", APPOINTMENT_TAG_ID); + } + + @Override + protected Result> extractEventUrls(Result programPage) { + if (programPage.optional().isEmpty()) return transform(programPage); + List list = programPage.optional() + .get() // + .find(attributeHas("class", "eventrow")) + .stream() + .flatMap(t -> t.find(ofType("h3")).stream()) + .map(t -> t.find(ofType("a"))) + .flatMap(List::stream) + .map(t -> t.get("href")) + .toList(); + return Payload.of(list); + } + + @Override + public Result> extractLinkAnchors(Result tagResult) { + if (tagResult.optional().isEmpty()) return transform(tagResult); + var tag = tagResult.optional().get(); + tag.find(attributeEquals("id", "filterbar")).stream().findAny().ifPresent(Tag::remove); // remove div with unrelated links + var anchors = tag.find(withAttribute("href")); + return Payload.of(anchors); + } + + @Override + protected Result extractLinksTag(Tag eventTag) { + return Payload.of(eventTag); + } + + @Override + protected Result extractLocationTag(Tag eventTag) { + return Payload.of(new Text(LOCATION)); + } + + @Override + protected Result extractStartTag(Tag eventTag) { + List tags = eventTag.find(attributeEquals("class", "se-header")); + if (tags.size() == 1) return Payload.of(tags.getFirst()); + return Error.of("Failed to find event time information"); + } + + @Override + protected List extractTags(Tag eventTag) { + return List.of("Kassablanca"); + } + + @Override + protected Result extractTitleTag(Tag eventTag) { + var list = eventTag.find(ofType("h1")); + if (list.size() == 1) return Payload.of(list.getFirst()); + return Error.of("Failed to find title tag"); + } + + @Override + protected Result parseEndDate(String string) { + return null; + } + + @Override + protected Result parseStartDate(String string) { + var matcher = START_DATE_PATTERN.matcher(string); + if (matcher.find()) { + var day = Integer.parseInt(matcher.group(1)); + var month = Integer.parseInt(matcher.group(2)); + var year = Integer.parseInt(matcher.group(3)); + var hour = Integer.parseInt(matcher.group(4)); + var minute = Integer.parseInt(matcher.group(5)); + var date = LocalDateTime.of(year, month, day, hour, minute); + return Payload.of(date); + } + return Error.of("Could not recognize start date/time"); + } + + @Override + protected String programURL() { + return BASE_URL + "/programm"; + } +} diff --git a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Rosenkeller.java b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Rosenkeller.java index 454cdf4..47d6ac1 100644 --- a/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Rosenkeller.java +++ b/de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/Rosenkeller.java @@ -9,6 +9,7 @@ import de.srsoftware.tools.Error; import de.srsoftware.tools.Payload; import de.srsoftware.tools.Result; import de.srsoftware.tools.Tag; +import java.security.NoSuchAlgorithmException; import java.time.LocalDateTime; import java.util.List; import java.util.regex.Pattern; @@ -19,6 +20,10 @@ public class Rosenkeller extends BaseImporter { private static final Pattern DATE_PATTERN = Pattern.compile("(\\d+) (\\w+)(\\W+(\\d+):(\\d+))?"); private static final String DEFAULT_LOCATION = "Rosenkeller, Johannisstr. 13, 07743 Jena"; + public Rosenkeller() throws NoSuchAlgorithmException { + super(); + } + @Override protected String baseUrl() { return BASE_URL;