Browse Source

refactored CosmicDawn importer - now working!

Signed-off-by: Stephan Richter <s.richter@srsoftware.de>
main
Stephan Richter 4 months ago
parent
commit
e0dde9aa9e
  1. 2
      de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/Util.java
  2. 109
      de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/CosmicDawn.java
  3. 2
      de.srsoftware.cal.web/src/main/resources/script/event.js

2
de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/Util.java

@ -92,7 +92,7 @@ public class Util { @@ -92,7 +92,7 @@ public class Util {
}
public static Result<LocalDate> parseGermanDate(String s){
var match = GERMAN_DATE_PATTERN.matcher(s);
var match = GERMAN_DATE_PATTERN.matcher(" "+s+" ");
if (match.find()){
var day = Integer.parseInt(match.group(1));
var month = Integer.parseInt(match.group(2));

109
de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/CosmicDawn.java

@ -1,12 +1,17 @@ @@ -1,12 +1,17 @@
/* © SRSoftware 2024 */
package de.srsoftware.cal.importer.jena;
import static de.srsoftware.cal.Util.parseGermanDate;
import static de.srsoftware.cal.Util.parseGermanTime;
import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.CLASS;
import static de.srsoftware.tools.TagFilter.*;
import static java.nio.charset.StandardCharsets.UTF_8;
import de.srsoftware.cal.BaseImporter;
import de.srsoftware.cal.Util;
import de.srsoftware.cal.api.Coords;
import de.srsoftware.tools.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@ -16,11 +21,16 @@ import java.nio.file.Files; @@ -16,11 +21,16 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDate;
import java.time.LocalTime;
import java.util.DuplicateFormatFlagsException;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public abstract class CosmicDawn extends BaseImporter {
public class CosmicDawn extends BaseImporter {
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)");
private static final String DEFAULT_LOCATION = "Cosmic Dawn e.V., Spitzweidenweg 28, 07743 Jena";
private static final Coords DEFAULT_COORDS = new Coords(50.93663, 11.59254);
public CosmicDawn() throws NoSuchAlgorithmException {
super();
@ -32,20 +42,45 @@ public abstract class CosmicDawn extends BaseImporter { @@ -32,20 +42,45 @@ public abstract class CosmicDawn extends BaseImporter {
}
@Override
protected Result<Tag> extractDescriptionTag(Tag eventTag) {
var list = eventTag.find(attributeEndsWith("class","event-body-content"));
return list.isEmpty() ? error("failed to find <div class=\"…event-body-content\">") : Payload.of(list.getFirst());
public String description() {
return "Importiert Events des Kulturbahnhofs in Jena";
}
protected Result<Tag> extractEndTag(Tag eventTag) {
/**
 * Supplies the tag filter used for attachments.
 * The filter is built with {@code attributeContains(CLASS, "single-event-page")}.
 * NOTE(review): how the base importer applies this filter is not visible here – confirm against BaseImporter.
 * @return predicate created by {@code attributeContains} for the class value "single-event-page"
 */
@Override
protected Predicate<Tag> extractAttachmentsFilter() {
return attributeContains(CLASS,"single-event-page");
}
/**
 * Supplies the tag filter used for the event description.
 * The filter is built with {@code attributeContains(CLASS, "event-body-content")}.
 * @return predicate created by {@code attributeContains} for the class value "event-body-content"
 */
@Override
protected Predicate<Tag> extractDescriptionFilter() {
return attributeContains(CLASS,"event-body-content");
}
/**
 * Returns the fixed {@code DEFAULT_COORDS} for every event.
 * @param eventTag the event tag; not inspected by this implementation
 * @return a {@code Payload} wrapping {@code DEFAULT_COORDS}
 */
@Override
protected Result<Coords> extractCoords(Tag eventTag) {
return Payload.of(DEFAULT_COORDS);
}
/**
 * Provides no filter for an end date.
 * NOTE(review): presumably the base importer treats a null filter as "no end date available" – confirm against BaseImporter.
 * @return always {@code null}
 */
@Override
protected Predicate<Tag> extractEndDateFilter() {
return null;
}
/**
 * Provides no filter for an end time.
 * NOTE(review): presumably the base importer treats a null filter as "no end time available" – confirm against BaseImporter.
 * @return always {@code null}
 */
@Override
protected Predicate<Tag> extractEndTimeFilter() {
return null;
}
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
if (pageResult.optional().isEmpty()) return transform(pageResult);
List<Tag> list = pageResult.optional().get().find(attributeEquals("class", "inside-article"));
return (list.isEmpty()) ? error("Failed to find <div class=\"inside-article\">!") : Payload.of(list.getFirst());
var res = super.extractEventTag(pageResult);
// remove youtube embeddings
if (res instanceof Payload<Tag> payload) payload.get().find(attributeContains(CLASS,"youtube")).forEach(Tag::remove);
return res;
}
/**
 * Supplies the tag filter used to locate the event tag.
 * The filter is built with {@code attributeEquals(CLASS, "inside-article")}.
 * @return predicate created by {@code attributeEquals} for the class value "inside-article"
 */
@Override
protected Predicate<Tag> extractEventTagFilter() {
return attributeEquals(CLASS,"inside-article");
}
@Override
@ -65,33 +100,39 @@ public abstract class CosmicDawn extends BaseImporter { @@ -65,33 +100,39 @@ public abstract class CosmicDawn extends BaseImporter {
return Payload.of(urlList);
}
/**
 * Supplies the tag filter used for links.
 * The filter is built with {@code attributeContains(CLASS, "single-event-wrapper")}.
 * @return predicate created by {@code attributeContains} for the class value "single-event-wrapper"
 */
@Override
protected Predicate<Tag> extractLinksFilter() {
return attributeContains(CLASS,"single-event-wrapper");
}
/**
 * Returns the fixed {@code DEFAULT_LOCATION} for every event.
 * @param eventTag the event tag; not inspected by this implementation
 * @return a {@code Payload} wrapping {@code DEFAULT_LOCATION}
 */
@Override
protected Result<String> extractLocation(Tag eventTag) {
return Payload.of(DEFAULT_LOCATION);
}
@Override
protected Result<Tag> extractLocationTag(Tag eventTag) {
protected Predicate<Tag> extractLocationFilter() {
return null;
}
protected Result<Tag> extractStartTag(Tag eventTag) {
var dateTags = eventTag.find(attributeContains("class","event-date-time"));
if (dateTags.isEmpty()) return error("Start date not found!");
var times = eventTag.find(attributeEquals("class","event_time")).stream()
.flatMap(tag -> tag.find(IS_SPAN).stream())
.filter(tag -> tag.toString().contains("Begin"))
.toList();
if (times.isEmpty()) return error("Start time not found!");
var div = Tag.of("div").add(dateTags.getFirst()).add(times.getFirst());
return Payload.of(div);
/**
 * Supplies the tag filter used for the start date.
 * The filter is built with {@code attributeEquals("itemprop", "startDate")}.
 * @return predicate created by {@code attributeEquals} for itemprop "startDate"
 */
@Override
protected Predicate<Tag> extractStartDateFilter() {
return attributeEquals("itemprop","startDate");
}
/**
 * Supplies the tag filter used for the start time.
 * The filter is built with {@code attributeEquals(CLASS, "event_time")}.
 * @return predicate created by {@code attributeEquals} for the class value "event_time"
 */
@Override
protected Predicate<Tag> extractStartTimeFilter() {
return attributeEquals(CLASS,"event_time");
}
@Override
protected List<String> extractTags(Tag eventTag) {
return List.of();
return List.of("Kulturbahnhof","Jena","CosmicDawn");
}
@Override
protected Result<Tag> extractTitleTag(Tag eventTag) {
var list = eventTag.find(ofType("h1"));
return list.isEmpty() ? error("failed to find <h1>") : Payload.of(list.getFirst());
protected Predicate<Tag> extractTitleFilter() {
return ofType("h1");
}
@Override
@ -100,19 +141,21 @@ public abstract class CosmicDawn extends BaseImporter { @@ -100,19 +141,21 @@ public abstract class CosmicDawn extends BaseImporter {
}
@Override
protected Result<LocalDate> parseStartDate(String date) {
var matcher = START_DATE_PATTERN.matcher(date);
if (matcher.find()){
int day = Integer.parseInt(matcher.group(1));
int mon = Integer.parseInt(matcher.group(2));
int year= Integer.parseInt(matcher.group(3));
int hour = Integer.parseInt(matcher.group(4));
int min = Integer.parseInt(matcher.group(5));
}
protected Result<LocalTime> parseEndTime(String string) {
return null;
}
/**
/**
 * Parses the start date by delegating to the statically imported {@code Util.parseGermanDate}.
 * @param string text containing the start date
 * @return whatever {@code parseGermanDate} returns for the given text
 */
@Override
protected Result<LocalDate> parseStartDate(String string) {
return parseGermanDate(string);
}
/**
 * Parses the start time by delegating to the statically imported {@code Util.parseGermanTime}.
 * @param string text containing the start time
 * @return whatever {@code parseGermanTime} returns for the given text
 */
@Override
protected Result<LocalTime> parseStartTime(String string) {
return parseGermanTime(string);
}
/**
* The Kuba page ships a lot of script along with its content, which prevents the tags from being parsed correctly.
* So we cut off the entire header...
* @param inputStream incoming InputStream, wrapped in a Result

2
de.srsoftware.cal.web/src/main/resources/script/event.js

@ -8,7 +8,7 @@ function attachmentList(json){ @@ -8,7 +8,7 @@ function attachmentList(json){
var attachments = document.getElementById('attachments');
attachments.innerHTML = '';
for (var attachment of json){
if (attachment.mime.startsWith('image')){
if (attachment.mime.startsWith('image')||attachment.url.includes('.jpg')||attachment.url.includes('.png')||attachment.url.includes('.gif')){
var img = document.createElement('img');
img.src = attachment.url;
attachments.appendChild(img);

Loading…
Cancel
Save