Commit 123389b5 authored by Gradl, Tobias

420: Implement configurable crawls -> [GS: Repetitive Crawl Model]

(OPENED)

Task-Url: #420
parent 2dfbcb90
@@ -21,7 +21,7 @@ allprojects {
 	coreVersion = "6.1-SNAPSHOT"
 	gtfVersion = "2.0.0-SNAPSHOT"
 	processingVersion = "4.1.0-SNAPSHOT"
-	colregModelVersion = "4.3.2-RELEASE"
+	colregModelVersion = "4.3.4-RELEASE"
 	dariahSpVersion = "2.1.4-SNAPSHOT"
 	jsonAssertVersion = "1.5.0"
@@ -182,6 +182,8 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
 	if (cCurrent.getEndpoints()!=null) {
 		for (Endpoint eCurrent : cCurrent.getEndpoints()) {
 			if (this.endpointsAreSame(eCurrent, eFetched)) {
+				eCurrent.setAccessModelId(eFetched.getAccessModelId());
 				mergeOrUnchanged = true;
 				this.mergeDatamodelReferences(eCurrent, eFetched);
 				deleteEndpoints.remove(eCurrent);
@@ -274,7 +276,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
 	Period shortestPeriod = null;
 	Period updatePeriod;
 	for (AccrualPojo accrual : fetchedCollection.getAccrualPojos()) {
-		if (this.knownUpdatePolicies.contains(accrual.getAccrualPolicy())) {
+		if (this.knownUpdatePolicies != null && this.knownUpdatePolicies.contains(accrual.getAccrualPolicy())) {
 			updatePeriod = this.updateFrequencyMap.get(accrual.getAccrualPeriodicity());
 			if (updatePeriod==null) {
 				updatePeriod = this.updateFrequencyMap.get(defaultUnclosedFrequencyKey);
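The remainder of this method lies outside the hunk; from the variables it evidently keeps the shortest of the candidate update periods. A minimal sketch of such a selection, assuming the client-side map holds java.time.Period values and that a rough day count is sufficient to order them (java.time.Period itself is not Comparable):

	import java.time.Period;

	class ShortestPeriodSketch {
		// Illustrative helper, not the actual CollectionSyncClient code
		static Period shorterOf(Period a, Period b) {
			if (a == null) return b;
			if (b == null) return a;
			return approxDays(a) <= approxDays(b) ? a : b;
		}

		// Rough normalization; exact enough to order values like P3D < P1W < P1M
		static int approxDays(Period p) {
			return p.getYears() * 365 + p.getMonths() * 30 + p.getDays();
		}
	}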
@@ -346,6 +348,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
 		}
 	}
+	e.setAccessModelId(accessPojo.getAccessModelId());
 	e.setUrl(accessPojo.getUri());
 	e.setPatterns(accessPojo.getPatterns());
@@ -394,7 +397,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
 			return false;
 		}
 	}
 	private boolean datasetsAreSame(Dataset ds1, Dataset ds2) {
 		if (!ds1.getId().equals(ds2.getId())) {
 			return false;
@@ -10,6 +10,7 @@ public class DatamodelConfigProperties {
 	private String presentation;
 	private String indexing;
 	private String metadata;
+	private String crawling;
 	private String oaidcModel;
 	private List<String> modelsWithMessageCodes;
 	private String modelsMessageCodePrefix;
package eu.dariah.de.search.crawling.crawler;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import eu.dariah.de.search.config.MainConfigProperties;
import eu.dariah.de.search.crawling.files.FileDownloader;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.model.ExtendedMappingContainer;
import eu.dariah.de.search.service.DatamodelService;
import eu.dariah.de.search.service.MappingService;
import lombok.Getter;
import lombok.Setter;
public class RepetitiveFileCrawlerImpl extends FileDownloader implements InitializingBean {

	@Autowired private MainConfigProperties mainConfig;
	@Autowired protected DatamodelService datamodelService;
	@Autowired private MappingService mappingService;

	@Getter @Setter
	Map<String, String> fileProcessingServiceMap;

	private List<String> handledUrls;
	private ExtendedMappingContainer mapping = null;

	// 1. Use downloader to get first file
	// 2. Get a mapping
	// 3. Execute the mapping

	@Override
	public String getUnitMessageCode() {
		return "~eu.dariah.de.minfba.search.crawling.file.crawler.unit";
	}

	@Override
	public String getTitleMessageCode() {
		return "~eu.dariah.de.minfba.search.crawling.file.crawler.title";
	}

	@Override
	public void afterPropertiesSet() throws Exception {
		// Unique base name per crawler instance instead of the fixed
		// @Value-injected name the plain FileDownloader used before
		this.fileName = UUID.randomUUID().toString();
	}

	@Override
	public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
		super.init(endpoint, crawl, sc);
		this.handledUrls = new ArrayList<>();
		if (mainConfig.getDatamodels().getCrawling()==null) {
			logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
		} else {
			if (endpoint.getAccessModelId()!=null) {
				logger.info("Dedicated access model configured: {}", endpoint.getAccessModelId());
				mapping = mappingService.getMappingBySourceAndTarget(endpoint.getAccessModelId(), mainConfig.getDatamodels().getCrawling());
			} else {
				logger.info("No dedicated access model, using datamodel: {}", sc.getModel().getId());
				mapping = mappingService.getMappingBySourceAndTarget(sc.getModel().getId(), mainConfig.getDatamodels().getCrawling());
			}
			if (mapping==null) {
				logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
			}
		}
	}

	@Override
	public void downloadFile() {
		super.downloadFile();
		File f = new File(this.getOutputPath());
		if (f.exists()) {
			logger.debug("file exists: {}", f.getAbsolutePath());
		}
	}

	// Suffix each download with the number of URLs handled so far
	// (<uuid>.0, <uuid>.1, ...), so repeated fetches do not overwrite each other
	@Override
	protected String getOutputFilename() {
		return fileName + "." + (handledUrls==null ? 0 : handledUrls.size());
	}
}
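The numbered comments above sketch where this class is headed; the repeat loop itself is not implemented in this commit (the task is still marked OPENED). A hypothetical outline of that loop, using only members visible above plus an invented extractNextUrl placeholder, which would live inside RepetitiveFileCrawlerImpl:

	// Hypothetical continuation, not part of this commit
	protected void crawlRepetitively(String firstUrl) {
		String nextUrl = firstUrl;
		while (nextUrl != null && !handledUrls.contains(nextUrl)) {
			handledUrls.add(nextUrl);
			downloadFile(); // output name gets its suffix from handledUrls.size()
			nextUrl = (mapping != null) ? extractNextUrl(getOutputPath(), mapping) : null;
		}
	}

	// Invented placeholder: a real implementation would execute the
	// GS: Repetitive Crawl Model mapping over the downloaded file and
	// read the follow-up URL from the result
	private String extractNextUrl(String filePath, ExtendedMappingContainer mapping) {
		return null;
	}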
@@ -11,8 +11,6 @@ import java.net.URL;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import org.springframework.beans.factory.annotation.Value;
 import de.unibamberg.minf.processing.exception.ResourceProcessingException;
 import eu.dariah.de.search.crawling.crawler.Crawler;
 import eu.dariah.de.search.model.Crawl;
@@ -21,15 +19,14 @@ import eu.dariah.de.search.model.ExtendedDatamodelContainer;
 public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
-	@Value("${processing.download.filename:download.tmp}")
-	private String fileName;
+	protected String fileName;
-	private URI inputURI;
+	protected URI inputURI;
 	private int bufferSize = 1024;
 	private int chunkSize = 1048576;
-	private boolean initialized = false;
+	protected boolean initialized = false;
 	public int getBufferSize() { return bufferSize; }
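Note on the visibility changes: fileName, inputURI and initialized move from private (with fileName formerly injected via @Value) to protected so that the new RepetitiveFileCrawlerImpl subclass can assign its own UUID-based file name and drive repeated downloads.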
@@ -27,6 +27,8 @@ public class Endpoint implements Identifiable {
 	private String url;
 	private String accessType;
+	private String accessModelId;
 	private String fileType;
 	private String dateTimeFormatPattern;
@@ -27,6 +27,7 @@ import eu.dariah.de.search.crawling.crawler.FileProcessor;
 import eu.dariah.de.search.crawling.crawler.GitCrawlerImpl;
 import eu.dariah.de.search.crawling.crawler.IndexCleaner;
 import eu.dariah.de.search.crawling.crawler.OaiPmhCrawlerImpl;
+import eu.dariah.de.search.crawling.crawler.RepetitiveFileCrawlerImpl;
 import eu.dariah.de.search.crawling.files.FileDownloader;
 import eu.dariah.de.search.crawling.files.FileUnarchiver;
 import eu.dariah.de.search.crawling.files.FileUnpacker;
@@ -63,7 +64,7 @@ public class CrawlingConfig extends CrawlingConfigProperties {
 	Map<String, String> accessChainMap = new HashMap<>();
 	accessChainMap.put("OAI-PMH", "oaiPmhCrawler");
 	accessChainMap.put("Git Repository", "gitCrawler");
-	accessChainMap.put("Online file", "fileDownloader");
+	accessChainMap.put("Online file", "fileCrawler");
 	crawlManager.setAccessChains(accessChainMap);
 	Map<String, String> fileProcessingChainMap = new HashMap<>();
@@ -105,8 +106,17 @@ public class CrawlingConfig extends CrawlingConfigProperties {
 	@Bean
 	@Scope("prototype")
-	public FileDownloader fileDownloader() {
-		return new FileDownloader();
+	public RepetitiveFileCrawlerImpl fileCrawler() {
+		RepetitiveFileCrawlerImpl fileCrawler = new RepetitiveFileCrawlerImpl();
+		Map<String, String> fileProcessingServiceMap = new HashMap<>();
+		fileProcessingServiceMap.put("XML", "xmlStringProcessor");
+		fileProcessingServiceMap.put("JSON", "jsonProcessingService");
+		fileProcessingServiceMap.put("CSV", "csvStringProcessor");
+		fileProcessingServiceMap.put("TSV", "tsvStringProcessor");
+		fileProcessingServiceMap.put("TEXT", "textStringProcessor");
+		fileCrawler.setFileProcessingServiceMap(fileProcessingServiceMap);
+		return fileCrawler;
 	}
 	@Bean
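The renamed prototype bean is what the accessChainMap entry above points at. A sketch (an assumption; the CrawlManager lookup itself is not part of this diff) of how a name-keyed chain resolves to a fresh crawler instance per crawl:

	import java.util.Map;
	import org.springframework.context.ApplicationContext;
	import eu.dariah.de.search.crawling.crawler.Crawler;

	class AccessChainLookupSketch {
		private final ApplicationContext context;
		private final Map<String, String> accessChains; // e.g. "Online file" -> "fileCrawler"

		AccessChainLookupSketch(ApplicationContext context, Map<String, String> accessChains) {
			this.context = context;
			this.accessChains = accessChains;
		}

		// @Scope("prototype") means each getBean call returns a new instance
		Crawler newCrawler(String accessType) {
			return context.getBean(accessChains.get(accessType), Crawler.class);
		}
	}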
@@ -233,4 +243,37 @@ public class CrawlingConfig extends CrawlingConfigProperties {
 		fileProcessingAntiPatterns.add(".git/**");
 		return fileProcessingAntiPatterns;
 	}
+
+	@Bean
+	public Map<String, String> updateFrequencyMap() {
+		Map<String, String> updateFrequencyMap = new HashMap<>();
+		updateFrequencyMap.put("http://purl.org/cld/freq/triennial", "P3M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/biennial", "P3M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/annual", "P3M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/semiannual", "P2M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAYear", "P2M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/quarterly", "P1M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/bimonthly", "P1M");
+		updateFrequencyMap.put("http://purl.org/cld/freq/monthly", "P2W");
+		updateFrequencyMap.put("http://purl.org/cld/freq/semimonthly", "P2W");
+		updateFrequencyMap.put("http://purl.org/cld/freq/biweekly", "P2W");
+		updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAMonth", "P1W");
+		updateFrequencyMap.put("http://purl.org/cld/freq/weekly", "P1W");
+		updateFrequencyMap.put("http://purl.org/cld/freq/semiweekly", "P1W");
+		updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAWeek", "P3D");
+		updateFrequencyMap.put("http://purl.org/cld/freq/daily", "P3D");
+		updateFrequencyMap.put("http://purl.org/cld/freq/continuous", "P3D");
+		updateFrequencyMap.put("http://purl.org/cld/freq/completelyIrregular", "P3D");
+		updateFrequencyMap.put("_defaultUnclosed", "P1M");
+		return updateFrequencyMap;
+	}
+
+	@Bean
+	public List<String> knownUpdatePolicies() {
+		List<String> knownUpdatePolicies = new ArrayList<>();
+		knownUpdatePolicies.add("http://purl.org/cld/accpol/passive");
+		knownUpdatePolicies.add("http://purl.org/cld/accpol/active");
+		knownUpdatePolicies.add("http://purl.org/cld/accpol/partial");
+		return knownUpdatePolicies;
+	}
 }
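Two observations on these beans. The map values are re-check intervals derived from the declared accrual periodicity, not the periodicity itself: a daily collection is re-crawled every three days, an annual one every three months, bounded between P3D and P3M. And since the CollectionSyncClient hunk above assigns lookups from updateFrequencyMap directly to a java.time.Period, the ISO-8601 strings are presumably parsed on the consumer side, roughly like this (an assumed conversion, not shown in the diff):

	import java.time.Period;
	import java.util.Map;
	import java.util.stream.Collectors;

	class UpdateFrequencySketch {
		// Assumed: "P3M", "P2W", ... parsed into java.time.Period before comparison
		static Map<String, Period> asPeriods(Map<String, String> raw) {
			return raw.entrySet().stream()
				.collect(Collectors.toMap(Map.Entry::getKey, e -> Period.parse(e.getValue())));
		}
	}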
@@ -47,6 +47,7 @@ datamodels:
   indexing: 5a27ab9b0246440d479c6c46
   presentation: 5b9004394c7b0405ccf90b00
   metadata: 5cd3f505dd960f585e960f63
+  crawling: 60812e54667bd50488d2b237
 #  modelsWithMessageCodes:
 #    - 598f06e706bffc03a8e44f82
 #    - 598f06e706bffc03a8e44f83
@@ -11,7 +11,7 @@ includeBuild('../colreg') {
 	}
 }*/

-/*includeBuild('../processing') {
+includeBuild('../processing') {
 	dependencySubstitution {
 		substitute module('de.unibamberg.minf.processing:processing-core') with project(':processing-core')
 		substitute module('de.unibamberg.minf.processing:processing-adapters') with project(':processing-adapters')
@@ -35,4 +35,4 @@ includeBuild('../core') {
 	dependencySubstitution {
 		substitute module('de.unibamberg.minf.core:core-web') with project(':core-web')
 	}
-}*/
+}
\ No newline at end of file
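Uncommenting these includeBuild blocks re-enables Gradle composite builds, so the de.unibamberg.minf.processing modules and core-web resolve against the local sibling checkouts via dependencySubstitution instead of published snapshots while the crawl model work is in progress.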