Commit 123389b5 authored by Gradl, Tobias

420: Implement configurable crawls -> [GS: Repetitive Crawl Model]

(OPENED)

Task-Url: #420
parent 2dfbcb90
......@@ -21,7 +21,7 @@ allprojects {
coreVersion = "6.1-SNAPSHOT"
gtfVersion = "2.0.0-SNAPSHOT"
processingVersion = "4.1.0-SNAPSHOT"
colregModelVersion = "4.3.2-RELEASE"
colregModelVersion = "4.3.4-RELEASE"
dariahSpVersion = "2.1.4-SNAPSHOT"
jsonAssertVersion = "1.5.0"
......
......@@ -182,6 +182,8 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
if (cCurrent.getEndpoints()!=null) {
for (Endpoint eCurrent : cCurrent.getEndpoints()) {
if (this.endpointsAreSame(eCurrent, eFetched)) {
eCurrent.setAccessModelId(eFetched.getAccessModelId());
mergeOrUnchanged = true;
this.mergeDatamodelReferences(eCurrent, eFetched);
deleteEndpoints.remove(eCurrent);
......@@ -274,7 +276,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
Period shortestPeriod = null;
Period updatePeriod;
for (AccrualPojo accrual : fetchedCollection.getAccrualPojos()) {
-	if (this.knownUpdatePolicies.contains(accrual.getAccrualPolicy())) {
+	if (this.knownUpdatePolicies != null && this.knownUpdatePolicies.contains(accrual.getAccrualPolicy())) {
updatePeriod = this.updateFrequencyMap.get(accrual.getAccrualPeriodicity());
if (updatePeriod==null) {
updatePeriod = this.updateFrequencyMap.get(defaultUnclosedFrequencyKey);
......@@ -346,6 +348,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
}
}
e.setAccessModelId(accessPojo.getAccessModelId());
e.setUrl(accessPojo.getUri());
e.setPatterns(accessPojo.getPatterns());
......
......@@ -10,6 +10,7 @@ public class DatamodelConfigProperties {
private String presentation;
private String indexing;
private String metadata;
private String crawling;
private String oaidcModel;
private List<String> modelsWithMessageCodes;
private String modelsMessageCodePrefix;
......
package eu.dariah.de.search.crawling.crawler;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import eu.dariah.de.search.config.MainConfigProperties;
import eu.dariah.de.search.crawling.files.FileDownloader;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.model.ExtendedMappingContainer;
import eu.dariah.de.search.service.DatamodelService;
import eu.dariah.de.search.service.MappingService;
import lombok.Getter;
import lombok.Setter;
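/**
 * Crawler for "Online file" endpoints that extends the plain FileDownloader:
 * it downloads an initial file, resolves a mapping between the endpoint's
 * access model (or, if none is set, the datamodel) and the configured
 * GS: Repetitive Crawl Model, and is intended to derive follow-up download
 * URLs from that mapping (see the numbered steps below).
 */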
public class RepetitiveFileCrawlerImpl extends FileDownloader implements InitializingBean {
@Autowired private MainConfigProperties mainConfig;
@Autowired protected DatamodelService datamodelService;
@Autowired private MappingService mappingService;
@Getter @Setter
Map<String, String> fileProcessingServiceMap;
private List<String> handledUrls;
private ExtendedMappingContainer mapping = null;
// 1. Use downloader to get first file
// 2. Get a mapping
// 3. Execute the mapping
@Override
public String getUnitMessageCode() {
return "~eu.dariah.de.minfba.search.crawling.file.crawler.unit";
}
@Override
public String getTitleMessageCode() {
return "~eu.dariah.de.minfba.search.crawling.file.crawler.title";
}
@Override
public void afterPropertiesSet() throws Exception {
this.fileName = UUID.randomUUID().toString();
}
@Override
public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
super.init(endpoint, crawl, sc);
this.handledUrls = new ArrayList<>();
if (mainConfig.getDatamodels().getCrawling()==null) {
logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
} else {
if (endpoint.getAccessModelId()!=null) {
logger.info("Dedicated access modell configured: {}", endpoint.getAccessModelId());
mapping = mappingService.getMappingBySourceAndTarget(endpoint.getAccessModelId(), mainConfig.getDatamodels().getCrawling());
} else {
logger.info("No dedicated access modell, using datamodel: {}", sc.getModel().getId());
mapping = mappingService.getMappingBySourceAndTarget(sc.getModel().getId(), mainConfig.getDatamodels().getCrawling());
}
if (mapping==null) {
logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
}
}
}
@Override
public void downloadFile() {
super.downloadFile();
File f = new File(this.getOutputPath());
if (f.exists()) {
logger.debug("file exists: {}", f.getAbsolutePath());
}
}
@Override
protected String getOutputFilename() {
return fileName + "." + (handledUrls==null ? 0 : handledUrls.size());
}
}
......@@ -11,8 +11,6 @@ import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.springframework.beans.factory.annotation.Value;
import de.unibamberg.minf.processing.exception.ResourceProcessingException;
import eu.dariah.de.search.crawling.crawler.Crawler;
import eu.dariah.de.search.model.Crawl;
......@@ -21,15 +19,14 @@ import eu.dariah.de.search.model.ExtendedDatamodelContainer;
public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
@Value("${processing.download.filename:download.tmp}")
-	private String fileName;
+	protected String fileName;
-	private URI inputURI;
+	protected URI inputURI;
private int bufferSize = 1024;
private int chunkSize = 1048576;
-	private boolean initialized = false;
+	protected boolean initialized = false;
public int getBufferSize() { return bufferSize; }
......
......@@ -27,6 +27,8 @@ public class Endpoint implements Identifiable {
private String url;
private String accessType;
private String accessModelId;
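// Added for the repetitive crawl model: fileType selects the file processing
// service; dateTimeFormatPattern presumably formats date placeholders in
// generated crawl URLs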
private String fileType;
private String dateTimeFormatPattern;
......
......@@ -27,6 +27,7 @@ import eu.dariah.de.search.crawling.crawler.FileProcessor;
import eu.dariah.de.search.crawling.crawler.GitCrawlerImpl;
import eu.dariah.de.search.crawling.crawler.IndexCleaner;
import eu.dariah.de.search.crawling.crawler.OaiPmhCrawlerImpl;
import eu.dariah.de.search.crawling.crawler.RepetitiveFileCrawlerImpl;
import eu.dariah.de.search.crawling.files.FileDownloader;
import eu.dariah.de.search.crawling.files.FileUnarchiver;
import eu.dariah.de.search.crawling.files.FileUnpacker;
......@@ -63,7 +64,7 @@ public class CrawlingConfig extends CrawlingConfigProperties {
Map<String, String> accessChainMap = new HashMap<>();
accessChainMap.put("OAI-PMH", "oaiPmhCrawler");
accessChainMap.put("Git Repository", "gitCrawler");
accessChainMap.put("Online file", "fileDownloader");
accessChainMap.put("Online file", "fileCrawler");
crawlManager.setAccessChains(accessChainMap);
Map<String, String> fileProcessingChainMap = new HashMap<>();
......@@ -105,8 +106,17 @@ public class CrawlingConfig extends CrawlingConfigProperties {
@Bean
@Scope("prototype")
-	public FileDownloader fileDownloader() {
-	return new FileDownloader();
+	public RepetitiveFileCrawlerImpl fileCrawler() {
RepetitiveFileCrawlerImpl fileCrawler = new RepetitiveFileCrawlerImpl();
Map<String, String> fileProcessingServiceMap = new HashMap<>();
fileProcessingServiceMap.put("XML", "xmlStringProcessor");
fileProcessingServiceMap.put("JSON", "jsonProcessingService");
fileProcessingServiceMap.put("CSV", "csvStringProcessor");
fileProcessingServiceMap.put("TSV", "tsvStringProcessor");
fileProcessingServiceMap.put("TEXT", "textStringProcessor");
fileCrawler.setFileProcessingServiceMap(fileProcessingServiceMap);
return fileCrawler;
}
@Bean
......@@ -233,4 +243,37 @@ public class CrawlingConfig extends CrawlingConfigProperties {
fileProcessingAntiPatterns.add(".git/**");
return fileProcessingAntiPatterns;
}
@Bean
public Map<String, String> updateFrequencyMap() {
Map<String, String> updateFrequencyMap = new HashMap<>();
updateFrequencyMap.put("http://purl.org/cld/freq/triennial", "P3M");
updateFrequencyMap.put("http://purl.org/cld/freq/biennial", "P3M");
updateFrequencyMap.put("http://purl.org/cld/freq/annual", "P3M");
updateFrequencyMap.put("http://purl.org/cld/freq/semiannual", "P2M");
updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAYear", "P2M");
updateFrequencyMap.put("http://purl.org/cld/freq/quarterly", "P1M");
updateFrequencyMap.put("http://purl.org/cld/freq/bimonthly", "P1M");
updateFrequencyMap.put("http://purl.org/cld/freq/monthly", "P2W");
updateFrequencyMap.put("http://purl.org/cld/freq/semimonthly", "P2W");
updateFrequencyMap.put("http://purl.org/cld/freq/biweekly", "P2W");
updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAMonth", "P1W");
updateFrequencyMap.put("http://purl.org/cld/freq/weekly", "P1W");
updateFrequencyMap.put("http://purl.org/cld/freq/semiweekly", "P1W");
updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAWeek", "P3D");
updateFrequencyMap.put("http://purl.org/cld/freq/daily", "P3D");
updateFrequencyMap.put("http://purl.org/cld/freq/continuous", "P3D");
updateFrequencyMap.put("http://purl.org/cld/freq/completelyIrregular", "P3D");
updateFrequencyMap.put("_defaultUnclosed", "P1M");
return updateFrequencyMap;
}
@Bean
public List<String> knownUpdatePolicies() {
List<String> knownUpdatePolicies = new ArrayList<>();
knownUpdatePolicies.add("http://purl.org/cld/accpol/passive");
knownUpdatePolicies.add("http://purl.org/cld/accpol/active");
knownUpdatePolicies.add("http://purl.org/cld/accpol/partial");
return knownUpdatePolicies;
}
}
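For orientation, a minimal, self-contained sketch of how the ISO-8601 values from updateFrequencyMap can be parsed into java.time.Period and reduced to the shortest re-crawl interval, roughly mirroring the accrual loop in CollectionSyncClient; the class, the comparison via estimated days, and the hard-coded map excerpt are illustrative assumptions, not the project's actual implementation:

import java.time.Period;
import java.util.List;
import java.util.Map;

public class UpdatePeriodSketch {
	// Illustrative excerpt of the updateFrequencyMap bean configured above
	static final Map<String, String> UPDATE_FREQUENCY_MAP = Map.of(
			"http://purl.org/cld/freq/monthly", "P2W",
			"http://purl.org/cld/freq/daily", "P3D",
			"_defaultUnclosed", "P1M");

	// Pick the shortest configured period across all accrual periodicities,
	// falling back to the _defaultUnclosed entry for unknown frequencies
	static Period shortestUpdatePeriod(List<String> accrualPeriodicities) {
		Period shortest = null;
		for (String periodicity : accrualPeriodicities) {
			String iso = UPDATE_FREQUENCY_MAP.getOrDefault(
					periodicity, UPDATE_FREQUENCY_MAP.get("_defaultUnclosed"));
			Period candidate = Period.parse(iso);
			if (shortest == null || estimatedDays(candidate) < estimatedDays(shortest)) {
				shortest = candidate;
			}
		}
		return shortest;
	}

	// Period has no natural ordering; approximate a month as 30 days to compare
	static long estimatedDays(Period p) {
		return p.toTotalMonths() * 30 + p.getDays();
	}

	public static void main(String[] args) {
		// Prints P3D: daily accrual yields the shortest re-crawl interval
		System.out.println(shortestUpdatePeriod(List.of(
				"http://purl.org/cld/freq/monthly",
				"http://purl.org/cld/freq/daily")));
	}
}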
......@@ -47,6 +47,7 @@ datamodels:
indexing: 5a27ab9b0246440d479c6c46
presentation: 5b9004394c7b0405ccf90b00
metadata: 5cd3f505dd960f585e960f63
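# GS: Repetitive Crawl Model used as mapping target for repetitive file crawling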
crawling: 60812e54667bd50488d2b237
# modelsWithMessageCodes:
# - 598f06e706bffc03a8e44f82
# - 598f06e706bffc03a8e44f83
......
......@@ -11,7 +11,7 @@ includeBuild('../colreg') {
}
}*/
-/*includeBuild('../processing') {
+includeBuild('../processing') {
dependencySubstitution {
substitute module('de.unibamberg.minf.processing:processing-core') with project(':processing-core')
substitute module('de.unibamberg.minf.processing:processing-adapters') with project(':processing-adapters')
......@@ -35,4 +35,4 @@ includeBuild('../core') {
dependencySubstitution {
substitute module('de.unibamberg.minf.core:core-web') with project(':core-web')
}
-}*/
\ No newline at end of file
+}
\ No newline at end of file