refactored CosmicDawn importer - now working!
Signed-off-by: Stephan Richter <s.richter@srsoftware.de>
This commit is contained in:
@@ -92,7 +92,7 @@ public class Util {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static Result<LocalDate> parseGermanDate(String s){
|
public static Result<LocalDate> parseGermanDate(String s){
|
||||||
var match = GERMAN_DATE_PATTERN.matcher(s);
|
var match = GERMAN_DATE_PATTERN.matcher(" "+s+" ");
|
||||||
if (match.find()){
|
if (match.find()){
|
||||||
var day = Integer.parseInt(match.group(1));
|
var day = Integer.parseInt(match.group(1));
|
||||||
var month = Integer.parseInt(match.group(2));
|
var month = Integer.parseInt(match.group(2));
|
||||||
|
|||||||
@@ -1,12 +1,17 @@
|
|||||||
/* © SRSoftware 2024 */
|
/* © SRSoftware 2024 */
|
||||||
package de.srsoftware.cal.importer.jena;
|
package de.srsoftware.cal.importer.jena;
|
||||||
|
|
||||||
|
import static de.srsoftware.cal.Util.parseGermanDate;
|
||||||
|
import static de.srsoftware.cal.Util.parseGermanTime;
|
||||||
import static de.srsoftware.tools.Error.error;
|
import static de.srsoftware.tools.Error.error;
|
||||||
import static de.srsoftware.tools.Result.transform;
|
import static de.srsoftware.tools.Result.transform;
|
||||||
|
import static de.srsoftware.tools.Tag.CLASS;
|
||||||
import static de.srsoftware.tools.TagFilter.*;
|
import static de.srsoftware.tools.TagFilter.*;
|
||||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||||
|
|
||||||
import de.srsoftware.cal.BaseImporter;
|
import de.srsoftware.cal.BaseImporter;
|
||||||
|
import de.srsoftware.cal.Util;
|
||||||
|
import de.srsoftware.cal.api.Coords;
|
||||||
import de.srsoftware.tools.*;
|
import de.srsoftware.tools.*;
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
@@ -16,11 +21,16 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.security.NoSuchAlgorithmException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
|
import java.time.LocalTime;
|
||||||
|
import java.util.DuplicateFormatFlagsException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public abstract class CosmicDawn extends BaseImporter {
|
public class CosmicDawn extends BaseImporter {
|
||||||
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)");
|
private static final Pattern START_DATE_PATTERN = Pattern.compile("(\\d\\d?).(\\d\\d?).(\\d{4}).*(\\d\\d?):(\\d\\d?)");
|
||||||
|
private static final String DEFAULT_LOCATION = "Cosmic Dawn e.V., Spitzweidenweg 28, 07743 Jena";
|
||||||
|
private static final Coords DEFAULT_COORDS = new Coords(50.93663, 11.59254);
|
||||||
|
|
||||||
public CosmicDawn() throws NoSuchAlgorithmException {
|
public CosmicDawn() throws NoSuchAlgorithmException {
|
||||||
super();
|
super();
|
||||||
@@ -32,20 +42,45 @@ public abstract class CosmicDawn extends BaseImporter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Result<Tag> extractDescriptionTag(Tag eventTag) {
|
public String description() {
|
||||||
var list = eventTag.find(attributeEndsWith("class","event-body-content"));
|
return "Importiert Events des Kulturbahnhofs in Jena";
|
||||||
return list.isEmpty() ? error("failed to find <div class=\"…event-body-content\">") : Payload.of(list.getFirst());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Result<Tag> extractEndTag(Tag eventTag) {
|
@Override
|
||||||
|
protected Predicate<Tag> extractAttachmentsFilter() {
|
||||||
|
return attributeContains(CLASS,"single-event-page");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Predicate<Tag> extractDescriptionFilter() {
|
||||||
|
return attributeContains(CLASS,"event-body-content");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Result<Coords> extractCoords(Tag eventTag) {
|
||||||
|
return Payload.of(DEFAULT_COORDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Predicate<Tag> extractEndDateFilter() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
protected Predicate<Tag> extractEndTimeFilter() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
|
protected Result<Tag> extractEventTag(Result<Tag> pageResult) {
|
||||||
if (pageResult.optional().isEmpty()) return transform(pageResult);
|
var res = super.extractEventTag(pageResult);
|
||||||
List<Tag> list = pageResult.optional().get().find(attributeEquals("class", "inside-article"));
|
// remove youtube embeddings
|
||||||
return (list.isEmpty()) ? error("Failed to find <div class=\"inside-article\">!") : Payload.of(list.getFirst());
|
if (res instanceof Payload<Tag> payload) payload.get().find(attributeContains(CLASS,"youtube")).forEach(Tag::remove);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Predicate<Tag> extractEventTagFilter() {
|
||||||
|
return attributeEquals(CLASS,"inside-article");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -65,33 +100,39 @@ public abstract class CosmicDawn extends BaseImporter {
|
|||||||
return Payload.of(urlList);
|
return Payload.of(urlList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Predicate<Tag> extractLinksFilter() {
|
||||||
|
return attributeContains(CLASS,"single-event-wrapper");
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Result<Tag> extractLocationTag(Tag eventTag) {
|
protected Result<String> extractLocation(Tag eventTag) {
|
||||||
|
return Payload.of(DEFAULT_LOCATION);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Predicate<Tag> extractLocationFilter() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Result<Tag> extractStartTag(Tag eventTag) {
|
@Override
|
||||||
var dateTags = eventTag.find(attributeContains("class","event-date-time"));
|
protected Predicate<Tag> extractStartDateFilter() {
|
||||||
if (dateTags.isEmpty()) return error("Start date not found!");
|
return attributeEquals("itemprop","startDate");
|
||||||
var times = eventTag.find(attributeEquals("class","event_time")).stream()
|
}
|
||||||
.flatMap(tag -> tag.find(IS_SPAN).stream())
|
|
||||||
.filter(tag -> tag.toString().contains("Begin"))
|
@Override
|
||||||
.toList();
|
protected Predicate<Tag> extractStartTimeFilter() {
|
||||||
if (times.isEmpty()) return error("Start time not found!");
|
return attributeEquals(CLASS,"event_time");
|
||||||
var div = Tag.of("div").add(dateTags.getFirst()).add(times.getFirst());
|
|
||||||
return Payload.of(div);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<String> extractTags(Tag eventTag) {
|
protected List<String> extractTags(Tag eventTag) {
|
||||||
return List.of();
|
return List.of("Kulturbahnhof","Jena","CosmicDawn");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Result<Tag> extractTitleTag(Tag eventTag) {
|
protected Predicate<Tag> extractTitleFilter() {
|
||||||
var list = eventTag.find(ofType("h1"));
|
return ofType("h1");
|
||||||
return list.isEmpty() ? error("failed to find <h1>") : Payload.of(list.getFirst());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -100,19 +141,21 @@ public abstract class CosmicDawn extends BaseImporter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Result<LocalDate> parseStartDate(String date) {
|
protected Result<LocalTime> parseEndTime(String string) {
|
||||||
var matcher = START_DATE_PATTERN.matcher(date);
|
|
||||||
if (matcher.find()){
|
|
||||||
int day = Integer.parseInt(matcher.group(1));
|
|
||||||
int mon = Integer.parseInt(matcher.group(2));
|
|
||||||
int year= Integer.parseInt(matcher.group(3));
|
|
||||||
int hour = Integer.parseInt(matcher.group(4));
|
|
||||||
int min = Integer.parseInt(matcher.group(5));
|
|
||||||
}
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
@Override
|
||||||
|
protected Result<LocalDate> parseStartDate(String string) {
|
||||||
|
return parseGermanDate(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Result<LocalTime> parseStartTime(String string) {
|
||||||
|
return parseGermanTime(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
* Die Kuba-Seite haut einen haufen Script mit raus, der dazu führt, dass die Tags nicht richtig geparsed werden.
|
* Die Kuba-Seite haut einen haufen Script mit raus, der dazu führt, dass die Tags nicht richtig geparsed werden.
|
||||||
* Also schneiden wir den kompletten header ab...
|
* Also schneiden wir den kompletten header ab...
|
||||||
* @param inputStream eingehender InputStream, verpackt in Result
|
* @param inputStream eingehender InputStream, verpackt in Result
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ function attachmentList(json){
|
|||||||
var attachments = document.getElementById('attachments');
|
var attachments = document.getElementById('attachments');
|
||||||
attachments.innerHTML = '';
|
attachments.innerHTML = '';
|
||||||
for (var attachment of json){
|
for (var attachment of json){
|
||||||
if (attachment.mime.startsWith('image')){
|
if (attachment.mime.startsWith('image')||attachment.url.includes('.jpg')||attachment.url.includes('.png')||attachment.url.includes('.gif')){
|
||||||
var img = document.createElement('img');
|
var img = document.createElement('img');
|
||||||
img.src = attachment.url;
|
img.src = attachment.url;
|
||||||
attachments.appendChild(img);
|
attachments.appendChild(img);
|
||||||
|
|||||||
Reference in New Issue
Block a user