Browse Source

refactored CosmicDawn importer - now working!

Signed-off-by: Stephan Richter <s.richter@srsoftware.de>
main
Stephan Richter 6 months ago
parent
commit
e0dde9aa9e
  1. 2
      de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/Util.java
  2. 109
      de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/CosmicDawn.java
  3. 2
      de.srsoftware.cal.web/src/main/resources/script/event.js

2
de.srsoftware.cal.base/src/main/java/de/srsoftware/cal/Util.java

@ -92,7 +92,7 @@ public class Util {
} }
public static Result<LocalDate> parseGermanDate(String s){ public static Result<LocalDate> parseGermanDate(String s){
var match = GERMAN_DATE_PATTERN.matcher(s); var match = GERMAN_DATE_PATTERN.matcher(" "+s+" ");
if (match.find()){ if (match.find()){
var day = Integer.parseInt(match.group(1)); var day = Integer.parseInt(match.group(1));
var month = Integer.parseInt(match.group(2)); var month = Integer.parseInt(match.group(2));

109
de.srsoftware.cal.importer/src/main/java/de/srsoftware/cal/importer/jena/CosmicDawn.java

@ -1,12 +1,17 @@
/* © SRSoftware 2024 */ /* © SRSoftware 2024 */
package de.srsoftware.cal.importer.jena; package de.srsoftware.cal.importer.jena;
import static de.srsoftware.cal.Util.parseGermanDate;
import static de.srsoftware.cal.Util.parseGermanTime;
import static de.srsoftware.tools.Error.error; import static de.srsoftware.tools.Error.error;
import static de.srsoftware.tools.Result.transform; import static de.srsoftware.tools.Result.transform;
import static de.srsoftware.tools.Tag.CLASS;
import static de.srsoftware.tools.TagFilter.*; import static de.srsoftware.tools.TagFilter.*;
import static java.nio.charset.StandardCharsets.UTF_8; import static java.nio.charset.StandardCharsets.UTF_8;
import de.srsoftware.cal.BaseImporter; import de.srsoftware.cal.BaseImporter;
import de.srsoftware.cal.Util;
import de.srsoftware.cal.api.Coords;
import de.srsoftware.tools.*; import de.srsoftware.tools.*;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
@ -16,11 +21,16 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.LocalTime;
import java.util.DuplicateFormatFlagsException;
import java.util.List; import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
public abstract class CosmicDawn extends BaseImporter { public class CosmicDawn extends BaseImporter {
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)"); private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)");
private static final String DEFAULT_LOCATION = "Cosmic Dawn e.V., Spitzweidenweg 28, 07743 Jena";
private static final Coords DEFAULT_COORDS = new Coords(50.93663, 11.59254);
public CosmicDawn() throws NoSuchAlgorithmException { public CosmicDawn() throws NoSuchAlgorithmException {
super(); super();
@ -32,20 +42,45 @@ public abstract class CosmicDawn extends BaseImporter {
} }
@Override @Override
protected Result<Tag> extractDescriptionTag(Tag eventTag) { public String description() {
var list = eventTag.find(attributeEndsWith("class","event-body-content")); return "Importiert Events des Kulturbahnhofs in Jena";
return list.isEmpty() ? error("failed to find <div class=\"…event-body-content\">") : Payload.of(list.getFirst());
} }
protected Result<Tag> extractEndTag(Tag eventTag) { @Override
protected Predicate<Tag> extractAttachmentsFilter() {
return attributeContains(CLASS,"single-event-page");
}
@Override
protected Predicate<Tag> extractDescriptionFilter() {
return attributeContains(CLASS,"event-body-content");
}
@Override
protected Result<Coords> extractCoords(Tag eventTag) {
return Payload.of(DEFAULT_COORDS);
}
@Override
protected Predicate<Tag> extractEndDateFilter() {
return null; return null;
} }
@Override @Override
protected Predicate<Tag> extractEndTimeFilter() {
return null;
}
protected Result<Tag> extractEventTag(Result<Tag> pageResult) { protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
if (pageResult.optional().isEmpty()) return transform(pageResult); var res = super.extractEventTag(pageResult);
List<Tag> list = pageResult.optional().get().find(attributeEquals("class", "inside-article")); // remove youtube embeddings
return (list.isEmpty()) ? error("Failed to find <div class=\"inside-article\">!") : Payload.of(list.getFirst()); if (res instanceof Payload<Tag> payload) payload.get().find(attributeContains(CLASS,"youtube")).forEach(Tag::remove);
return res;
}
@Override
protected Predicate<Tag> extractEventTagFilter() {
return attributeEquals(CLASS,"inside-article");
} }
@Override @Override
@ -65,33 +100,39 @@ public abstract class CosmicDawn extends BaseImporter {
return Payload.of(urlList); return Payload.of(urlList);
} }
@Override
protected Predicate<Tag> extractLinksFilter() {
return attributeContains(CLASS,"single-event-wrapper");
}
@Override
protected Result<String> extractLocation(Tag eventTag) {
return Payload.of(DEFAULT_LOCATION);
}
@Override @Override
protected Result<Tag> extractLocationTag(Tag eventTag) { protected Predicate<Tag> extractLocationFilter() {
return null; return null;
} }
protected Result<Tag> extractStartTag(Tag eventTag) { @Override
var dateTags = eventTag.find(attributeContains("class","event-date-time")); protected Predicate<Tag> extractStartDateFilter() {
if (dateTags.isEmpty()) return error("Start date not found!"); return attributeEquals("itemprop","startDate");
var times = eventTag.find(attributeEquals("class","event_time")).stream() }
.flatMap(tag -> tag.find(IS_SPAN).stream())
.filter(tag -> tag.toString().contains("Begin")) @Override
.toList(); protected Predicate<Tag> extractStartTimeFilter() {
if (times.isEmpty()) return error("Start time not found!"); return attributeEquals(CLASS,"event_time");
var div = Tag.of("div").add(dateTags.getFirst()).add(times.getFirst());
return Payload.of(div);
} }
@Override @Override
protected List<String> extractTags(Tag eventTag) { protected List<String> extractTags(Tag eventTag) {
return List.of(); return List.of("Kulturbahnhof","Jena","CosmicDawn");
} }
@Override @Override
protected Result<Tag> extractTitleTag(Tag eventTag) { protected Predicate<Tag> extractTitleFilter() {
var list = eventTag.find(ofType("h1")); return ofType("h1");
return list.isEmpty() ? error("failed to find <h1>") : Payload.of(list.getFirst());
} }
@Override @Override
@ -100,19 +141,21 @@ public abstract class CosmicDawn extends BaseImporter {
} }
@Override @Override
protected Result<LocalDate> parseStartDate(String date) { protected Result<LocalTime> parseEndTime(String string) {
var matcher = START_DATE_PATTERN.matcher(date);
if (matcher.find()){
int day = Integer.parseInt(matcher.group(1));
int mon = Integer.parseInt(matcher.group(2));
int year= Integer.parseInt(matcher.group(3));
int hour = Integer.parseInt(matcher.group(4));
int min = Integer.parseInt(matcher.group(5));
}
return null; return null;
} }
/** @Override
protected Result<LocalDate> parseStartDate(String string) {
return parseGermanDate(string);
}
@Override
protected Result<LocalTime> parseStartTime(String string) {
return parseGermanTime(string);
}
/**
* Die Kuba-Seite haut einen haufen Script mit raus, der dazu führt, dass die Tags nicht richtig geparsed werden. * Die Kuba-Seite haut einen haufen Script mit raus, der dazu führt, dass die Tags nicht richtig geparsed werden.
* Also schneiden wir den kompletten header ab... * Also schneiden wir den kompletten header ab...
* @param inputStream eingehender InputStream, verpackt in Result * @param inputStream eingehender InputStream, verpackt in Result

2
de.srsoftware.cal.web/src/main/resources/script/event.js

@ -8,7 +8,7 @@ function attachmentList(json){
var attachments = document.getElementById('attachments'); var attachments = document.getElementById('attachments');
attachments.innerHTML = ''; attachments.innerHTML = '';
for (var attachment of json){ for (var attachment of json){
if (attachment.mime.startsWith('image')){ if (attachment.mime.startsWith('image')||attachment.url.includes('.jpg')||attachment.url.includes('.png')||attachment.url.includes('.gif')){
var img = document.createElement('img'); var img = document.createElement('img');
img.src = attachment.url; img.src = attachment.url;
attachments.appendChild(img); attachments.appendChild(img);

Loading…
Cancel
Save