Commit bd2d28bb authored by Gradl, Tobias

420: Implement configurable crawls -> [GS: Repetitive Crawl Model]

(OPENED)

Task-Url: #420
parent 123389b5
Pipeline #23335 failed in 9 seconds
@@ -24,6 +24,7 @@ import eu.dariah.de.colreg.pojo.api.results.CollectionApiResultPojo;
import eu.dariah.de.search.Constants.AccessMethods;
import eu.dariah.de.search.api.client.base.BaseApiClientImpl;
import eu.dariah.de.search.config.MainConfigProperties;
+import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.model.Collection;
import eu.dariah.de.search.model.Dataset;
import eu.dariah.de.search.model.Endpoint;
@@ -392,7 +393,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
try {
return ep1.getAccessType().equals(ep2.getAccessType()) &&
ep1.getFileType().equals(ep2.getFileType()) &&
-ep1.renderAccessUrl().equals(ep2.renderAccessUrl());
+CrawlHelper.renderAccessUrl(ep1).equals(CrawlHelper.renderAccessUrl(ep2));
} catch (Exception e) {
return false;
}
......
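Note: throughout this commit the instance method Endpoint.renderAccessUrl() is replaced by a static CrawlHelper. The helper itself is not part of this diff; judging from the call sites, the one-argument overload is presumably a thin delegate to the three-argument variant that FileDownloader now calls. A minimal sketch under that assumption:

public class CrawlHelper {
	// Sketch only: convenience overload inferred from call sites such as
	// CollectionSyncClient and SruQueryExecutionServiceImpl below.
	public static String renderAccessUrl(Endpoint endpoint) {
		return renderAccessUrl(endpoint.getUrl(), endpoint.getParams(), null);
	}
}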
@@ -11,6 +11,7 @@ import org.springframework.context.ApplicationContextAware;
import de.unibamberg.minf.processing.git.adapter.GitRepositoryAdapter;
import de.unibamberg.minf.processing.git.service.GitRepositoryProcessingService;
+import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
@@ -56,7 +57,7 @@ public class GitCrawlerImpl extends GitRepositoryProcessingService implements Cr
@Override
public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
-this.setUrl(endpoint.renderAccessUrl());
+this.setUrl(CrawlHelper.renderAccessUrl(endpoint));
this.setBranch(endpoint.getSingleParamValue("branch"));
this.crawlId = crawl.getId();
......
@@ -11,6 +11,7 @@ import de.unibamberg.minf.processing.service.online.OaiPmhHarvestingService;
import eu.dariah.de.search.api.client.OaiPmhClient;
import eu.dariah.de.search.api.model.oaipmh.OaiPmhMetadataFormat;
import eu.dariah.de.search.api.model.oaipmh.OaiPmhResponseContainer;
+import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
@@ -47,7 +48,7 @@ public class OaiPmhCrawlerImpl extends OaiPmhHarvestingService implements Crawle
@Override
public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
-this.setUrl(endpoint.renderAccessUrl());
+this.setUrl(CrawlHelper.renderAccessUrl(endpoint));
this.setSet(endpoint.getSingleParamValue("set"));
this.crawlId = crawl.getId();
@@ -84,7 +85,7 @@ public class OaiPmhCrawlerImpl extends OaiPmhHarvestingService implements Crawle
}
String prefix = null;
-OaiPmhResponseContainer oaiFormatsResponse = oaiPmhClient.listMetadataFormats(ep.renderAccessUrl(), null);
+OaiPmhResponseContainer oaiFormatsResponse = oaiPmhClient.listMetadataFormats(CrawlHelper.renderAccessUrl(ep), null);
if (oaiFormatsResponse!=null && oaiFormatsResponse.getFormats()!=null) {
for (OaiPmhMetadataFormat format : oaiFormatsResponse.getFormats()) {
if (format.getMetadataNamespace().trim().toLowerCase().equals(rootNs)) {
......
package eu.dariah.de.search.crawling.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import de.unibamberg.minf.dme.model.base.Grammar;
import de.unibamberg.minf.dme.model.base.Nonterminal;
import de.unibamberg.minf.dme.model.function.FunctionImpl;
import de.unibamberg.minf.dme.model.grammar.GrammarImpl;
import de.unibamberg.minf.dme.model.mapping.base.MappedConcept;
import de.unibamberg.minf.mapping.model.MappingExecGroup;
import de.unibamberg.minf.mapping.service.MappingExecutionService;
import de.unibamberg.minf.processing.consumption.CollectingResourceConsumptionServiceImpl;
import de.unibamberg.minf.processing.exception.ProcessingConfigException;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import de.unibamberg.minf.processing.service.base.BaseResourceProcessingServiceImpl;
import de.unibamberg.minf.processing.service.base.ProcessingService;
import eu.dariah.de.search.config.MainConfigProperties;
import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.crawling.files.FileDownloader;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
@@ -20,20 +39,29 @@ import eu.dariah.de.search.service.MappingService;
import lombok.Getter;
import lombok.Setter;
-public class RepetitiveFileCrawlerImpl extends FileDownloader implements InitializingBean {
+public class RepetitiveFileCrawlerImpl extends FileDownloader implements ApplicationContextAware {
+private ApplicationContext appContext;
@Autowired private MainConfigProperties mainConfig;
@Autowired protected DatamodelService datamodelService;
@Autowired private MappingService mappingService;
@Autowired private MappingExecutionService mappingExecutionService;
@Getter @Setter
Map<String, String> fileProcessingServiceMap;
BaseResourceProcessingServiceImpl processingService;
private List<String> handledUrls;
private ExtendedMappingContainer mapping = null;
private Endpoint endpoint;
private ExtendedDatamodelContainer sourceDatamodelContainer;
private ExtendedDatamodelContainer targetDatamodelContainer;
// 1. Use downloader to get first file
// 2. Get a mapping
// 3. Execute the mapping
@@ -51,7 +79,8 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Initial
}
@Override
-public void afterPropertiesSet() throws Exception {
+public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
+this.appContext = applicationContext;
this.fileName = UUID.randomUUID().toString();
}
@@ -59,21 +88,24 @@
public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
super.init(endpoint, crawl, sc);
-endpoint.getFileType();
-endpoint.getAccessModelId();
this.endpoint = endpoint;
processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType())));
targetDatamodelContainer = datamodelService.findById(mainConfig.getDatamodels().getCrawling());
this.handledUrls = new ArrayList<>();
if (mainConfig.getDatamodels().getCrawling()==null) {
logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
} else {
if (endpoint.getAccessModelId()!=null) {
logger.info("Dedicated access model configured: {}", endpoint.getAccessModelId());
-mapping = mappingService.getMappingBySourceAndTarget(endpoint.getAccessModelId(), mainConfig.getDatamodels().getCrawling());
+sourceDatamodelContainer = datamodelService.findById(endpoint.getAccessModelId());
} else {
logger.info("No dedicated access model, using datamodel: {}", sc.getModel().getId());
-mapping = mappingService.getMappingBySourceAndTarget(sc.getModel().getId(), mainConfig.getDatamodels().getCrawling());
+sourceDatamodelContainer = sc;
}
+mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodelContainer.getModel().getId(), mainConfig.getDatamodels().getCrawling());
if (mapping==null) {
logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
}
@@ -89,15 +121,96 @@
logger.debug("file exists: {}", f.getAbsolutePath());
}
CollectingResourceConsumptionServiceImpl collector = new CollectingResourceConsumptionServiceImpl();
try {
processingService.setSchema(sourceDatamodelContainer.getModel());
processingService.setRoot((Nonterminal)sourceDatamodelContainer.getOrRenderElementHierarchy());
processingService.setInputStream(new FileInputStream(f));
processingService.init();
processingService.addConsumptionService(collector);
processingService.run();
} catch (ProcessingConfigException | FileNotFoundException e) {
logger.error("Exception while processing", e);
}
-@Override
-protected String getOutputFilename() {
-return fileName + "." + (handledUrls==null ? 0 : handledUrls.size());
+logger.info("Resources counted: {}", collector.getResources().size());
MappingExecGroup mex = this.buildMappingExecutionGroup(mapping, targetDatamodelContainer);
CollectingResourceConsumptionServiceImpl collector2 = new CollectingResourceConsumptionServiceImpl();
mappingExecutionService.addConsumptionService(collector2);
if (mex!=null && mex.getConcepts()!=null && !mex.getConcepts().isEmpty()) {
try {
mappingExecutionService.init(mex, collector.getResources());
mappingExecutionService.run();
} catch (ProcessingConfigException e) {
logger.error("Failed to initialize MappingExecutionService", e);
}
}
logger.info("Trans resources counted:" + collector2.getResources().size());
if (collector2.getResources()!=null) {
for (Resource r : collector2.getResources()) {
List<Resource> res = ResourceHelper.findRecursive(r, "GET.Param");
String newUrl = CrawlHelper.renderAccessUrl(this.endpoint.getUrl(), this.endpoint.getParams(), res);
// TODO: Next download from here...
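// (a hypothetical continuation of this loop is sketched after this file)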
logger.debug(newUrl);
}
}
}
public MappingExecGroup buildMappingExecutionGroup(ExtendedMappingContainer mc, ExtendedDatamodelContainer scTarget) {
if (mc==null) {
return null;
}
MappingExecGroup exec = new MappingExecGroup();
exec.setMapping(mc.getMapping());
exec.setTargetSchemaId(mc.getMapping().getTargetId());
exec.setTargetElementTree(scTarget.getOrRenderElementHierarchy());
for (MappedConcept c : mc.getMapping().getConcepts()) {
if (c==null) {
continue;
}
if (c.getElementGrammarIdsMap()!=null) {
for (String elementid : c.getElementGrammarIdsMap().keySet()) {
String grammarId = c.getElementGrammarIdsMap().get(elementid);
Grammar g;
if (c.getElementGrammarIdsMap().get(elementid)!=null && mc.getGrammars()!=null &&
mc.getGrammars().containsKey(c.getElementGrammarIdsMap().get(elementid))) {
g = mc.getGrammars().get(c.getElementGrammarIdsMap().get(elementid));
} else {
g = new GrammarImpl(mc.getMapping().getId(), grammarId);
g.setId(grammarId);
g.setPassthrough(true);
}
exec.addGrammar(g);
}
}
FunctionImpl f = new FunctionImpl(mc.getMapping().getId(), c.getFunctionId());
if (mc.getFunctions().containsKey(c.getFunctionId())) {
f.setFunction(mc.getFunctions().get(c.getFunctionId()));
}
exec.addMappedConcept(c, f);
}
return exec;
}
@Override
protected String getOutputFilename() {
return fileName + "." + (handledUrls==null ? 0 : handledUrls.size());
}
}
\ No newline at end of file
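The TODO in downloadFile() above ("Next download from here...") leaves the repetitive crawl loop open. A minimal sketch of a possible continuation, assuming the rendered URL can be handed back to the inherited FileDownloader: downloadNext() is a hypothetical helper, and subclass access to inputURI is an assumption (only handledUrls, getOutputFilename() and downloadFile() appear in this commit):

// Hypothetical continuation, not part of this commit: fetch each newly
// rendered URL exactly once; getOutputFilename() already suffixes output
// files with handledUrls.size(), so successive downloads do not overwrite
// each other.
private void downloadNext(String newUrl) throws MalformedURLException, URISyntaxException {
	if (newUrl == null || handledUrls.contains(newUrl)) {
		return; // nothing new to fetch
	}
	handledUrls.add(newUrl);
	this.inputURI = new URL(newUrl).toURI(); // assumes subclass access to inputURI
	this.downloadFile();
}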
@@ -10,11 +10,21 @@ import java.net.URISyntaxException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.message.BasicNameValuePair;
import de.unibamberg.minf.processing.exception.ResourceProcessingException;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.crawling.crawler.Crawler;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.EndpointParam;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
@@ -66,9 +76,7 @@ public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
super.init(endpoint, crawl, sc);
try {
this.setupPaths(crawl);
-this.inputURI = new URL(endpoint.renderAccessUrl()).toURI();
+this.inputURI = new URL(CrawlHelper.renderAccessUrl(endpoint.getUrl(), endpoint.getParams(), null)).toURI();
this.initialized = true;
} catch (MalformedURLException | URISyntaxException e) {
logger.error("File downloader initialization failed: " + e.getMessage(), e);
@@ -84,7 +92,6 @@ public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
this.downloadFile();
}
public void downloadFile() {
if (this.getInputURI() == null) {
logger.error("Either download URL or fileName was not specified or both.");
......
package eu.dariah.de.search.model;
import java.net.URISyntaxException;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.http.client.utils.URIBuilder;
import org.springframework.data.annotation.Transient;
import de.unibamberg.minf.dme.model.base.Identifiable;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
public class Endpoint implements Identifiable {
private static final long serialVersionUID = 4977814286153993508L;
@@ -41,26 +37,6 @@ public class Endpoint implements Identifiable {
private boolean unaccessible;
@Transient private boolean deleted;
-public String renderAccessUrl() {
-if (this.getUrl()==null) {
-return null;
-}
-try {
-URIBuilder b = new URIBuilder(this.getUrl());
-if (params!=null) {
-for (EndpointParam p : params) {
-b.addParameter(p.getParam(), p.getValue());
-}
-}
-return b.build().toString();
-} catch (URISyntaxException e) {
-log.error("Failed to build URL", e);
-}
-return null;
-}
public List<EndpointParam> getParams(String param) {
return params==null ? null : this.params.stream()
.filter(p -> p.getParam().equals(param))
......
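The renderAccessUrl() removed above is presumably the logic that moved into CrawlHelper. The new call sites additionally pass the GET.Param resources collected in RepetitiveFileCrawlerImpl, so the three-argument helper is plausibly the old URIBuilder loop extended by those dynamic parameters. A sketch under that assumption (how Resource keys and values map to query parameters is guessed):

// Sketch only: assumes the same imports as the removed method (URIBuilder,
// URISyntaxException) plus de.unibamberg.minf.processing.model.base.Resource.
public static String renderAccessUrl(String url, List<EndpointParam> params, List<Resource> dynamicParams) {
	if (url==null) {
		return null;
	}
	try {
		URIBuilder b = new URIBuilder(url);
		if (params!=null) {
			for (EndpointParam p : params) {
				b.addParameter(p.getParam(), p.getValue());
			}
		}
		if (dynamicParams!=null) {
			// Assumption: each resource matched on "GET.Param" carries the
			// parameter name as its key and the parameter value as its value.
			for (Resource r : dynamicParams) {
				b.addParameter(r.getKey(), r.getValue()==null ? null : r.getValue().toString());
			}
		}
		return b.build().toString();
	} catch (URISyntaxException e) {
		log.error("Failed to build URL", e);
	}
	return null;
}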
@@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.JsonNode;
import de.unibamberg.minf.dme.model.base.Element;
import de.unibamberg.minf.dme.model.base.Nonterminal;
import de.unibamberg.minf.processing.service.xml.XmlProcessingService;
+import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.es.service.IndexingService;
import eu.dariah.de.search.indexing.model.ResourceContainer;
import eu.dariah.de.search.model.Collection;
@@ -109,7 +110,7 @@ public class SruQueryExecutionServiceImpl extends BaseResultService implements I
processingSvc.setRoot((Nonterminal)r);
processingSvc.init();
-URIBuilder b = new URIBuilder(sruEntry.getValue().renderAccessUrl());
+URIBuilder b = new URIBuilder(CrawlHelper.renderAccessUrl(sruEntry.getValue()));
b.addParameter("query", ((SimpleQuery)q).getQueryString());
b.addParameter("recordSchema", alias);
b.addParameter("maximumRecords", "10");
......
-Subproject commit 85b67ab8ee09fdac92ab1a5baa3d42544bf73608
+Subproject commit 3d405aef5a96c8c67ae0bbccebd3a1891ac38081