Commit 79cdd252 authored by Gradl, Tobias

420: Implement configurable crawls -> [GS: Repetitive Crawl Model]


Task-Url: #420
parent bd2d28bb
package eu.dariah.de.search.crawling;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.message.BasicNameValuePair;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.EndpointParam;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class CrawlHelper {
private CrawlHelper() {}
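// Builds the effective access URL for an endpoint: the statically configured
// endpoint parameters and (in the three-argument variant) dynamically derived
// parameters are appended to the base URL as query parameters.
// Hypothetical example: a base URL of "https://example.org/api" with the endpoint
// parameter verb=ListRecords and a dynamic Name="resumptionToken"/Value="abc" pair
// renders as "https://example.org/api?verb=ListRecords&resumptionToken=abc".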
public static String renderAccessUrl(Endpoint ep) {
return renderAccessUrl(ep.getUrl(), ep.getParams(), null);
}
public static String renderAccessUrl(String url, List<EndpointParam> endpointParams, List<Resource> dynamicParams) {
if (url==null) {
return null;
}
try {
URIBuilder b = new URIBuilder(url);
if (endpointParams!=null) {
for (EndpointParam p : endpointParams) {
b.addParameter(p.getParam(), p.getValue());
}
}
b.addParameters(createNameValuePairs(dynamicParams));
return b.build().toString();
} catch (URISyntaxException e) {
log.error("Failed to build URL", e);
}
return null;
}
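// Recursively collects "Name" and "Value" resources from the mapped result trees
// and turns every non-empty name/value combination into a query parameter.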
private static List<NameValuePair> createNameValuePairs(List<Resource> dynamicParams) {
if (dynamicParams==null) {
return new ArrayList<>();
}
List<NameValuePair> pairs = new ArrayList<>();
List<Resource> names;
List<Resource> values;
for (Resource r : dynamicParams) {
names = ResourceHelper.findRecursive(r, "Name");
values = ResourceHelper.findRecursive(r, "Value");
for (Resource name : names) {
if (name.getValue()==null || name.getValue().toString().trim().isEmpty()) {
continue;
}
for (Resource value : values) {
if (value.getValue()==null || value.getValue().toString().trim().isEmpty()) {
continue;
}
pairs.add(new BasicNameValuePair(name.getValue().toString().trim(), value.getValue().toString().trim()));
}
}
}
return pairs;
}
}
\ No newline at end of file
package eu.dariah.de.search.crawling.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.UUID;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import de.unibamberg.minf.core.util.Stopwatch;
import de.unibamberg.minf.dme.model.base.Grammar;
import de.unibamberg.minf.dme.model.base.Nonterminal;
import de.unibamberg.minf.dme.model.function.FunctionImpl;
@@ -26,7 +30,6 @@ import de.unibamberg.minf.processing.exception.ProcessingConfigException;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import de.unibamberg.minf.processing.service.base.BaseResourceProcessingServiceImpl;
import de.unibamberg.minf.processing.service.base.ProcessingService;
import eu.dariah.de.search.config.MainConfigProperties;
import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.crawling.files.FileDownloader;
@@ -51,33 +54,28 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
Map<String, String> fileProcessingServiceMap;
@Getter @Setter private int politenessTimespan = 1000; // 1 call per second (to same endpoint)
BaseResourceProcessingServiceImpl processingService;
CollectingResourceConsumptionServiceImpl sourceResCollector = null;
MappingExecGroup mExecGroup;
CollectingResourceConsumptionServiceImpl targetResCollector = null;
private List<String> handledUrls;
private ExtendedMappingContainer mapping = null;
private ExtendedMappingContainer mapping;
private Endpoint endpoint;
private Crawl crawl;
private ExtendedDatamodelContainer sourceDatamodelContainer;
private ExtendedDatamodelContainer targetDatamodelContainer;
// 1. Use downloader to get first file
// 2. Get a mapping
// 3. Execute the mapping
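// 4. Collect further download URIs from the mapped resources and repeat until the queue is empty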
@Override
public String getUnitMessageCode() {
return "~eu.dariah.de.minfba.search.crawling.file.crawler.unit";
}
@Override
public String getTitleMessageCode() {
return "~eu.dariah.de.minfba.search.crawling.file.crawler.title";
}
List<String> processedUrls;
Queue<String> downloadUris;
Stopwatch swPoliteness = new Stopwatch();
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
this.appContext = applicationContext;
@@ -89,96 +87,107 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
super.init(endpoint, crawl, sc);
this.endpoint = endpoint;
this.crawl = crawl;
processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType())));
targetDatamodelContainer = datamodelService.findById(mainConfig.getDatamodels().getCrawling());
this.handledUrls = new ArrayList<>();
if (mainConfig.getDatamodels().getCrawling()==null) {
logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
} else {
if (endpoint.getAccessModelId()!=null) {
logger.info("Dedicated access modell configured: {}", endpoint.getAccessModelId());
sourceDatamodelContainer = datamodelService.findById(endpoint.getAccessModelId());
} else {
logger.info("No dedicated access modell, using datamodel: {}", sc.getModel().getId());
sourceDatamodelContainer = sc;
}
mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodelContainer.getModel().getId(), mainConfig.getDatamodels().getCrawling());
if (mapping==null) {
logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
}
this.initCrawlModels(sc);
if (mapping==null) {
return;
}
this.initServices();
this.processedUrls = new ArrayList<>();
this.downloadUris = new LinkedList<>();
}
@Override
public void downloadFile() {
swPoliteness.start();
super.downloadFile();
File f = new File(this.getOutputPath());
if (f.exists()) {
logger.debug("file exists: {}", f.getAbsolutePath());
if (this.mExecGroup==null) {
logger.debug("Resumptive crawling not applicable -> crawl done");
return;
}
CollectingResourceConsumptionServiceImpl collector = new CollectingResourceConsumptionServiceImpl();
this.reset();
this.processDownloadedFile();
try {
processingService.setSchema(sourceDatamodelContainer.getModel());
processingService.setRoot((Nonterminal)sourceDatamodelContainer.getOrRenderElementHierarchy());
processingService.setInputStream(new FileInputStream(f));
processingService.init();
processingService.addConsumptionService(collector);
processingService.run();
} catch (ProcessingConfigException | FileNotFoundException e) {
logger.error("Exception while processing", e);
}
logger.info("Resources counted:" + collector.getResources().size());
MappingExecGroup mex = this.buildMappingExecutionGroup(mapping, targetDatamodelContainer);
this.collectDownloadUris();
this.setupAndDownloadNextFile();
}
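// Resolves the source model (a dedicated access model if configured, otherwise the endpoint's datamodel), the configured crawl model and the mapping between them.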
private void initCrawlModels(ExtendedDatamodelContainer sc) {
mapping = null;
CollectingResourceConsumptionServiceImpl collector2 = new CollectingResourceConsumptionServiceImpl();
mappingExecutionService.addConsumptionService(collector2);
// Crawl model available
if (mainConfig.getDatamodels().getCrawling()==null) {
logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
return;
}
if (mex.getConcepts()!=null || mex.getConcepts().size()>0) {
try {
mappingExecutionService.init(mex, collector.getResources());
mappingExecutionService.run();
} catch (ProcessingConfigException e) {
logger.error("Failed to initialize MappingExecutionService", e);
// Dedicated access model or datamodel?
if (endpoint.getAccessModelId()!=null) {
logger.debug("Dedicated access modell configured: {}", endpoint.getAccessModelId());
sourceDatamodelContainer = datamodelService.findById(endpoint.getAccessModelId());
if (sourceDatamodelContainer==null) {
logger.warn("Dedicated access modell configured but not available (Sync with DME required?)");
return;
}
} else {
logger.debug("No dedicated access modell, using datamodel: {}", sc.getModel().getId());
sourceDatamodelContainer = sc;
}
logger.info("Trans resources counted:" + collector2.getResources().size());
// Crawl model
targetDatamodelContainer = datamodelService.findById(mainConfig.getDatamodels().getCrawling());
if (targetDatamodelContainer==null) {
logger.warn("Crawl modell configured but not available (Sync with DME required?)");
return;
}
// Mapping between source (access or data) model and target crawl model
mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodelContainer.getModel().getId(), mainConfig.getDatamodels().getCrawling());
if (mapping==null) {
logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
}
}
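// Looks up the processing service registered for the endpoint's file type, attaches the source/target resource collectors and builds the mapping execution group.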
private void initServices() {
processingService = null;
if (collector2.getResources()!=null) {
// Setup everything that is needed for file processing
if (!fileProcessingServiceMap.containsKey(endpoint.getFileType())) {
logger.warn("Endpoint file type unsupported by repetitive crawling: {}", endpoint.getFileType());
return;
}
try {
processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType())));
processingService.setSchema(sourceDatamodelContainer.getModel());
processingService.setRoot((Nonterminal)sourceDatamodelContainer.getOrRenderElementHierarchy());
for (Resource r : collector2.getResources()) {
List<Resource> res = ResourceHelper.findRecursive(r, "GET.Param");
String newUrl = CrawlHelper.renderAccessUrl(this.endpoint.getUrl(), this.endpoint.getParams(), res);
// TODO: Next download from here...
logger.debug(newUrl);
}
sourceResCollector = new CollectingResourceConsumptionServiceImpl();
processingService.addConsumptionService(sourceResCollector);
} catch (Exception e) {
logger.warn("No supporting processing service available for file type: {}", endpoint.getFileType());
return;
}
// Mapping execution
mExecGroup = this.buildMappingExecutionGroup(mapping, targetDatamodelContainer);
targetResCollector = new CollectingResourceConsumptionServiceImpl();
mappingExecutionService.addConsumptionService(targetResCollector);
if (mExecGroup.getConcepts()==null || mExecGroup.getConcepts().isEmpty()) {
mExecGroup = null;
}
}
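// Translates the mapped concepts of a mapping into an executable MappingExecGroup targeting the crawl model's element hierarchy.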
public MappingExecGroup buildMappingExecutionGroup(ExtendedMappingContainer mc, ExtendedDatamodelContainer scTarget) {
private MappingExecGroup buildMappingExecutionGroup(ExtendedMappingContainer mc, ExtendedDatamodelContainer scTarget) {
if (mc==null) {
return null;
}
MappingExecGroup exec = new MappingExecGroup();
exec.setMapping(mc.getMapping());
exec.setTargetSchemaId(mc.getMapping().getTargetId());
exec.setTargetElementTree(scTarget.getOrRenderElementHierarchy());
for (MappedConcept c : mc.getMapping().getConcepts()) {
@@ -208,9 +217,104 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
}
return exec;
}
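// Discards resources collected for the previous file before the next one is processed.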
private void reset() {
if (sourceResCollector != null) {
sourceResCollector.setResources(null);
}
if (targetResCollector != null) {
targetResCollector.setResources(null);
}
}
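// Parses the downloaded file against the source model, then executes the mapping so that crawl-model resources are collected in targetResCollector.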
private void processDownloadedFile() {
logger.debug("Processing downloaded file: {}", this.getOutputPath());
try {
processingService.setInputStream(new FileInputStream(this.getOutputPath()));
processingService.init();
processingService.run();
} catch (ProcessingConfigException | FileNotFoundException e) {
logger.error("Exception while processing", e);
}
if (sourceResCollector.getResources()!=null && !sourceResCollector.getResources().isEmpty()) {
try {
mappingExecutionService.init(mExecGroup, sourceResCollector.getResources());
mappingExecutionService.run();
} catch (ProcessingConfigException e) {
logger.error("Failed to initialize MappingExecutionService", e);
}
}
logger.debug("Mapping execution transformed {} to {} resources",
sourceResCollector.getResources()==null? 0 : sourceResCollector.getResources().size(),
targetResCollector.getResources()==null? 0 : targetResCollector.getResources().size());
}
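// Renders a new access URL from each mapped GET.Param resource and enqueues every URL that has not been processed before.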
private void collectDownloadUris() {
List<Resource> res;
String newUrl;
if (targetResCollector.getResources()!=null) {
for (Resource r : targetResCollector.getResources()) {
res = ResourceHelper.findRecursive(r, "GET.Param");
newUrl = CrawlHelper.renderAccessUrl(this.endpoint.getUrl(), this.endpoint.getParams(), res);
if (!processedUrls.contains(newUrl)) {
processedUrls.add(newUrl);
downloadUris.add(newUrl);
}
}
}
}
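// Prepares output paths and the input URI for the next queued download; invalid URIs are skipped by moving on to the next queue entry.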
private void setupAndDownloadNextFile() {
if (downloadUris.isEmpty()) {
logger.debug("Download queue is empty -> file crawling is complete");
return;
}
try {
this.setupPaths(this.crawl);
} catch (MalformedURLException e) {
logger.error("Failed to setup paths for crawl", e);
return;
}
this.inputURI = null;
try {
String downloadUri = downloadUris.remove();
logger.debug("Processing next downloadURI: {}", downloadUri);
this.inputURI = new URL(downloadUri).toURI();
} catch (MalformedURLException | URISyntaxException e) {
logger.error("Failed to setup and download next file URI", e);
}
// If the next inputURI is set up -> download, else continue with the next download URI
if (this.inputURI!=null) {
this.continueDownload();
} else {
this.setupAndDownloadNextFile();
}
}
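// Enforces the politeness setting: waits for the remainder of the politeness timespan (measured since the last download started) before downloading the next file.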
private void continueDownload() {
long sleepTs = this.politenessTimespan - swPoliteness.getElapsedTime();
if (sleepTs > 0) {
if (logger.isDebugEnabled()) {
logger.debug(String.format("Crawl sleeping %sms due to politeness setting (%sms)", sleepTs, this.getPolitenessTimespan()));
}
try {
Thread.sleep(sleepTs);
this.downloadFile();
} catch (InterruptedException e) {
logger.error("Thead.sleep interrupted", e);
Thread.currentThread().interrupt();
}
} else {
this.downloadFile();
}
}
@Override
protected String getOutputFilename() {
return fileName + "." + (handledUrls==null ? 0 : handledUrls.size());
return fileName + "." + (processedUrls==null ? 0 : processedUrls.size());
}
}
\ No newline at end of file
@@ -10,21 +10,11 @@ import java.net.URISyntaxException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.message.BasicNameValuePair;
import de.unibamberg.minf.processing.exception.ResourceProcessingException;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.crawling.crawler.Crawler;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.EndpointParam;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
public class FileDownloader extends BaseFileStreamCrawler implements Crawler {