Commit 00412c53 authored by Gradl, Tobias's avatar Gradl, Tobias
Browse files

420: Implement configurable crawls -> [GS: Repetitive Crawl Model]

(OPENED)

Task-Url: #420
parent 79cdd252
Pipeline #23358 failed with stage
in 9 seconds
...@@ -22,7 +22,7 @@ allprojects { ...@@ -22,7 +22,7 @@ allprojects {
gtfVersion = "2.0.0-SNAPSHOT" gtfVersion = "2.0.0-SNAPSHOT"
processingVersion = "4.1.0-SNAPSHOT" processingVersion = "4.1.0-SNAPSHOT"
colregModelVersion = "4.3.4-RELEASE" colregModelVersion = "4.3.4-RELEASE"
dariahSpVersion = "2.1.4-SNAPSHOT" dariahSpVersion = "2.1.6-RELEASE"
jsonAssertVersion = "1.5.0" jsonAssertVersion = "1.5.0"
jodaTimeVersion = "2.10.10" jodaTimeVersion = "2.10.10"
......
...@@ -101,6 +101,7 @@ public abstract class BaseSyncService<TModel extends Identifiable, TApi extends ...@@ -101,6 +101,7 @@ public abstract class BaseSyncService<TModel extends Identifiable, TApi extends
@Override @Override
public void destroy() throws Exception { public void destroy() throws Exception {
if (syncExecutor!=null) {
try { try {
syncExecutor.shutdown(); syncExecutor.shutdown();
// Wait until all threads are finished // Wait until all threads are finished
...@@ -109,6 +110,7 @@ public abstract class BaseSyncService<TModel extends Identifiable, TApi extends ...@@ -109,6 +110,7 @@ public abstract class BaseSyncService<TModel extends Identifiable, TApi extends
logger.error("Error closing sync executor", e); logger.error("Error closing sync executor", e);
} }
} }
}
@Override @Override
public Boolean callAsync() { public Boolean callAsync() {
......
...@@ -20,21 +20,17 @@ public class CrawlHelper { ...@@ -20,21 +20,17 @@ public class CrawlHelper {
private CrawlHelper() {} private CrawlHelper() {}
public static String renderAccessUrl(Endpoint ep) { public static String renderAccessUrl(Endpoint ep) {
return renderAccessUrl(ep.getUrl(), ep.getParams(), null); return renderAccessUrl(ep.getUrl(), ep.getParams(), null, null);
} }
public static String renderAccessUrl(String url, List<EndpointParam> endpointParams, List<Resource> dynamicParams) { public static String renderAccessUrl(String url, List<EndpointParam> endpointParams, List<Resource> dynamicParams, List<Resource> removeParams) {
if (url==null) { if (url==null) {
return null; return null;
} }
try { try {
URIBuilder b = new URIBuilder(url); URIBuilder b = new URIBuilder(url);
if (endpointParams!=null) { b.addParameters(filterNameValuePairs(endpointParams, removeParams));
for (EndpointParam p : endpointParams) { b.addParameters(createNameValuePairs(dynamicParams, removeParams));
b.addParameter(p.getParam(), p.getValue());
}
}
b.addParameters(createNameValuePairs(dynamicParams));
return b.build().toString(); return b.build().toString();
} catch (URISyntaxException e) { } catch (URISyntaxException e) {
log.error("Failed to build URL", e); log.error("Failed to build URL", e);
...@@ -42,12 +38,43 @@ public class CrawlHelper { ...@@ -42,12 +38,43 @@ public class CrawlHelper {
return null; return null;
} }
private static List<NameValuePair> createNameValuePairs(List<Resource> dynamicParams) { private static List<NameValuePair> filterNameValuePairs(List<EndpointParam> endpointParams, List<Resource> removeParams) {
if (endpointParams==null) {
return new ArrayList<>();
}
List<NameValuePair> pairs = new ArrayList<>();
// Both structures are allowed: RemoveParam with value & RemoveParam with child 'name'
List<String> removeParamNames = new ArrayList<>();
if (removeParams!=null) {
for (Resource r : removeParams) {
if (r.getValue()!=null) {
removeParamNames.add(r.getValue().toString());
}
if (r.getChildResources()!=null && !r.getChildResources().isEmpty()) {
for (Resource name : ResourceHelper.findRecursive(r, "Name")) {
if (name.getValue()!=null) {
removeParamNames.add(name.getValue().toString());
}
}
}
}
}
for (EndpointParam p : endpointParams) {
if (!removeParamNames.contains(p.getParam())) {
pairs.add(new BasicNameValuePair(p.getParam(), p.getValue()));
}
}
return pairs;
}
private static List<NameValuePair> createNameValuePairs(List<Resource> dynamicParams, List<Resource> removeParams) {
if (dynamicParams==null) { if (dynamicParams==null) {
return new ArrayList<>(); return new ArrayList<>();
} }
List<NameValuePair> pairs = new ArrayList<>(); List<NameValuePair> pairs = new ArrayList<>();
List<Resource> names; List<Resource> names = null;
List<Resource> values; List<Resource> values;
for (Resource r : dynamicParams) { for (Resource r : dynamicParams) {
names = ResourceHelper.findRecursive(r, "Name"); names = ResourceHelper.findRecursive(r, "Name");
......
...@@ -75,8 +75,6 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware, ...@@ -75,8 +75,6 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
@Override @Override
public void afterPropertiesSet() throws Exception { public void afterPropertiesSet() throws Exception {
pipelineExecutor = Executors.newFixedThreadPool(this.getMaxPoolSize()); pipelineExecutor = Executors.newFixedThreadPool(this.getMaxPoolSize());
logger.info("Initializing CrawlManager...checking registered crawlers and processors for accessiblity");
//this.checkProcessingClasses();
logger.info("CrawlManager initialized"); logger.info("CrawlManager initialized");
} }
...@@ -219,25 +217,25 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware, ...@@ -219,25 +217,25 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
String access = null; String access = null;
String file = null; String file = null;
for (AccessMethods mAv : AccessMethods.values()) { for (AccessMethods mAv : AccessMethods.values()) {
if (mAv.toString().toLowerCase().equals(ep.getAccessType().toLowerCase())) { if (mAv.toString().equalsIgnoreCase(ep.getAccessType())) {
access = mAv.toString(); access = mAv.toString();
break; break;
} }
} }
for (FileTypes ftv : FileTypes.values()) { for (FileTypes ftv : FileTypes.values()) {
if (ftv.toString().toLowerCase().equals(ep.getFileType().toLowerCase())) { if (ftv.toString().equalsIgnoreCase(ep.getFileType())) {
file = ftv.toString(); file = ftv.toString();
break; break;
} }
} }
// Online but no access type detected // Online but no access type detected
if (access==null && c.getBaseCrawlId()==null) { if (access==null && c.getBaseCrawlId()==null) {
logger.error(String.format("Unknown access type [%s]; cancelling crawl", ep.getAccessType())); logger.error("Unknown access type [{}]; cancelling crawl", ep.getAccessType());
this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR); this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR);
return null; return null;
} }
if (file==null) { if (file==null) {
logger.error(String.format("Unknown file type method [%s]; cancelling crawl", ep.getFileType())); logger.error("Unknown file type method [{}]; cancelling crawl", ep.getFileType());
this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR); this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR);
return null; return null;
} }
...@@ -359,8 +357,8 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware, ...@@ -359,8 +357,8 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
private List<Crawler> getCrawlers(String method, Map<String, String> processingChain) throws ProcessingConfigException { private List<Crawler> getCrawlers(String method, Map<String, String> processingChain) throws ProcessingConfigException {
if (!processingChain.containsKey(method)) { if (!processingChain.containsKey(method)) {
logger.error(String.format("No processing service implemented/configured for method [%s]", method.toString())); logger.error("No processing service implemented/configured for method [{}]", method);
throw new ProcessingConfigException(String.format("No processing service implemented/configured for method [%s]", method.toString())); throw new ProcessingConfigException(String.format("No processing service implemented/configured for method [%s]", method));
} }
try { try {
String[] serviceRefs = processingChain.get(method).split(","); String[] serviceRefs = processingChain.get(method).split(",");
...@@ -375,27 +373,4 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware, ...@@ -375,27 +373,4 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
} }
} }
/*private void checkProcessingClasses() {
if (this.onlineProcessingChains!=null) {
this.checkProcessingClasses(this.onlineProcessingChains);
}
if (this.offlineProcessingChains!=null) {
this.checkProcessingClasses(this.offlineProcessingChains);
}
}
private void checkProcessingClasses(Map<String, String> processingChains) {
for (String am : processingChains.keySet()) {
String[] serviceRefs = processingChains.get(am).split(",");
Class<?> crawlerClass;
for (int i=0; i<serviceRefs.length; i++) {
try {
crawlerClass = Class.forName(serviceRefs[i].trim());
Crawler.class.cast(appContext.getBean(crawlerClass));
} catch (Exception e) {
logger.warn(String.format("Could not access class or bean [%s] as configured for processing. This will result in errors when attempting to crawl data.", serviceRefs[i].trim()), e);
}
}
}
}*/
} }
...@@ -6,9 +6,11 @@ import java.net.MalformedURLException; ...@@ -6,9 +6,11 @@ import java.net.MalformedURLException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.net.URL; import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue; import java.util.Queue;
import java.util.UUID; import java.util.UUID;
...@@ -44,42 +46,43 @@ import lombok.Setter; ...@@ -44,42 +46,43 @@ import lombok.Setter;
public class RepetitiveFileCrawlerImpl extends FileDownloader implements ApplicationContextAware { public class RepetitiveFileCrawlerImpl extends FileDownloader implements ApplicationContextAware {
private ApplicationContext appContext; private ApplicationContext appContext;
@Autowired private MainConfigProperties mainConfig; @Autowired private MainConfigProperties mainConfig;
@Autowired protected DatamodelService datamodelService; @Autowired protected DatamodelService datamodelService;
@Autowired private MappingService mappingService; @Autowired private MappingService mappingService;
@Autowired private MappingExecutionService mappingExecutionService; @Autowired private MappingExecutionService mappingExecutionService;
@Getter @Setter @Getter @Setter private Map<String, String> fileProcessingServiceMap;
Map<String, String> fileProcessingServiceMap;
@Getter @Setter private int politenessTimespan = 1000; // 1 call per second (to same endpoint) @Getter @Setter private int politenessTimespan = 1000; // 1 call per second (to same endpoint)
BaseResourceProcessingServiceImpl processingService;
CollectingResourceConsumptionServiceImpl sourceResCollector = null;
MappingExecGroup mExecGroup;
CollectingResourceConsumptionServiceImpl targetResCollector = null;
private ExtendedMappingContainer mapping;
private Endpoint endpoint; private Endpoint endpoint;
private Crawl crawl; private Crawl crawl;
private ExtendedDatamodelContainer sourceDatamodelContainer; private ExtendedDatamodelContainer sourceDatamodel;
private ExtendedDatamodelContainer targetDatamodelContainer; private ExtendedDatamodelContainer targetDatamodel;
private ExtendedMappingContainer mapping;
private BaseResourceProcessingServiceImpl processingService;
private MappingExecGroup mExecGroup;
private CollectingResourceConsumptionServiceImpl sourceResCollector = null;
private CollectingResourceConsumptionServiceImpl targetResCollector = null;
List<String> processedUrls; private List<String> processedUrls;
Queue<String> downloadUris; private Queue<String> downloadUris;
Stopwatch swPoliteness = new Stopwatch(); Stopwatch swPoliteness = new Stopwatch();
@Override
public String getOutputFilename() {
	// Suffix the base name with the number of already processed URLs so each
	// repetition of the crawl writes to a distinct file (0 before the first run).
	int repetitionIndex = (processedUrls==null) ? 0 : processedUrls.size();
	return super.getOutputFilename() + "." + repetitionIndex;
}
@Override @Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
this.appContext = applicationContext; this.appContext = applicationContext;
this.fileName = UUID.randomUUID().toString(); this.outputFilename = UUID.randomUUID().toString();
} }
@Override @Override
...@@ -94,25 +97,67 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -94,25 +97,67 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
return; return;
} }
this.initServices(); this.initServices();
if (processingService==null || mExecGroup==null) {
return;
}
this.processedUrls = new ArrayList<>(); this.processedUrls = new ArrayList<>();
this.downloadUris = new LinkedList<>(); this.downloadUris = new LinkedList<>();
} }
@Override @Override
public void downloadFile() { public void download() {
swPoliteness.start(); swPoliteness.start();
super.downloadFile(); super.download();
if (this.mExecGroup==null) { if (this.mExecGroup==null) {
logger.debug("Resumptive crawling not applicable -> crawl done"); logger.debug("Resumptive crawling not applicable -> crawl done");
return; this.registerFinished();
} } else {
this.reset(); this.reset();
this.processDownloadedFile(); this.processDownloadedFile();
this.collectDownloadUris(); this.collectDownloadUris();
this.setupAndDownloadNextFile(); this.setupAndDownloadNextFile();
} }
}
// Size in bytes of the file currently being downloaded (reset after each file).
long currentSize = 0;
// Cumulative size in bytes over all files downloaded by this crawler instance.
long overallSize = 0;
// Notifies the registered listener, if any, that this crawl has finished.
protected void registerFinished() {
if (this.getListener() != null) {
this.getListener().finished(this.getUuid());
}
}
// Notifies the registered listener, if any, that this crawl failed.
protected void registerError() {
if (this.getListener() != null) {
this.getListener().error(this.getUuid());
}
}
// Records the size of the file about to be downloaded and reports the new
// cumulative total to the listener. Called once per file by the superclass
// download logic — NOTE(review): assumed, superclass not visible here; confirm.
@Override
protected void updateFileSize(long size) {
currentSize = size;
overallSize += size;
if (this.getListener() != null) {
this.getListener().updateSize(this.getUuid(), overallSize);
}
}
// Reports absolute progress to the listener: bytes of all previously completed
// files (overallSize - currentSize) plus the position within the current file.
@Override
protected void updateFileProcessed(long position) {
if (this.getListener() != null) {
this.getListener().processed(this.getUuid(), overallSize - currentSize + position);
}
}
// Resets the per-file counter once a file completes; overallSize already
// includes the finished file's bytes.
@Override
protected void registerFileFinished() {
currentSize = 0;
}
// Intentionally a no-op: per-file errors are not reported individually here —
// presumably handled via registerError() at crawl level; verify against caller.
@Override
protected void registerFileError() {}
private void initCrawlModels(ExtendedDatamodelContainer sc) { private void initCrawlModels(ExtendedDatamodelContainer sc) {
...@@ -127,25 +172,25 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -127,25 +172,25 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
// Dedicated access model or datamodel? // Dedicated access model or datamodel?
if (endpoint.getAccessModelId()!=null) { if (endpoint.getAccessModelId()!=null) {
logger.debug("Dedicated access modell configured: {}", endpoint.getAccessModelId()); logger.debug("Dedicated access modell configured: {}", endpoint.getAccessModelId());
sourceDatamodelContainer = datamodelService.findById(endpoint.getAccessModelId()); sourceDatamodel = datamodelService.findById(endpoint.getAccessModelId());
if (sourceDatamodelContainer==null) { if (sourceDatamodel==null) {
logger.warn("Dedicated access modell configured but not available (Sync with DME required?)"); logger.warn("Dedicated access modell configured but not available (Sync with DME required?)");
return; return;
} }
} else { } else {
logger.debug("No dedicated access modell, using datamodel: {}", sc.getModel().getId()); logger.debug("No dedicated access modell, using datamodel: {}", sc.getModel().getId());
sourceDatamodelContainer = sc; sourceDatamodel = sc;
} }
// Crawl model // Crawl model
targetDatamodelContainer = datamodelService.findById(mainConfig.getDatamodels().getCrawling()); targetDatamodel = datamodelService.findById(mainConfig.getDatamodels().getCrawling());
if (targetDatamodelContainer==null) { if (targetDatamodel==null) {
logger.warn("Crawl modell configured but not available (Sync with DME required?)"); logger.warn("Crawl modell configured but not available (Sync with DME required?)");
return; return;
} }
// Mapping between source (access or data) model and target crawl model // Mapping between source (access or data) model and target crawl model
mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodelContainer.getModel().getId(), mainConfig.getDatamodels().getCrawling()); mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodel.getModel().getId(), mainConfig.getDatamodels().getCrawling());
if (mapping==null) { if (mapping==null) {
logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured"); logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
} }
...@@ -161,8 +206,8 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -161,8 +206,8 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
} }
try { try {
processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType()))); processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType())));
processingService.setSchema(sourceDatamodelContainer.getModel()); processingService.setSchema(sourceDatamodel.getModel());
processingService.setRoot((Nonterminal)sourceDatamodelContainer.getOrRenderElementHierarchy()); processingService.setRoot((Nonterminal)sourceDatamodel.getOrRenderElementHierarchy());
sourceResCollector = new CollectingResourceConsumptionServiceImpl(); sourceResCollector = new CollectingResourceConsumptionServiceImpl();
processingService.addConsumptionService(sourceResCollector); processingService.addConsumptionService(sourceResCollector);
...@@ -173,10 +218,10 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -173,10 +218,10 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
} }
// Mapping execution // Mapping execution
mExecGroup = this.buildMappingExecutionGroup(mapping, targetDatamodelContainer); mExecGroup = this.buildMappingExecutionGroup(mapping, targetDatamodel);
targetResCollector = new CollectingResourceConsumptionServiceImpl(); targetResCollector = new CollectingResourceConsumptionServiceImpl();
mappingExecutionService.addConsumptionService(targetResCollector); mappingExecutionService.addConsumptionService(targetResCollector);
if (mExecGroup.getConcepts()==null || mExecGroup.getConcepts().isEmpty()) { if (mExecGroup!=null && (mExecGroup.getConcepts()==null || mExecGroup.getConcepts().isEmpty())) {
mExecGroup = null; mExecGroup = null;
} }
} }
...@@ -194,21 +239,8 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -194,21 +239,8 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
if (c==null) { if (c==null) {
continue; continue;
} }
if (c.getElementGrammarIdsMap()!=null) { exec.setGrammarsMap(this.collectGrammars(mc.getId(), c.getElementGrammarIdsMap(), mc.getGrammars()));
for (String elementid : c.getElementGrammarIdsMap().keySet()) {
String grammarId = c.getElementGrammarIdsMap().get(elementid);
Grammar g;
if (c.getElementGrammarIdsMap().get(elementid)!=null && mc.getGrammars()!=null &&
mc.getGrammars().containsKey(c.getElementGrammarIdsMap().get(elementid))) {
g = mc.getGrammars().get(c.getElementGrammarIdsMap().get(elementid));
} else {
g = new GrammarImpl(mc.getMapping().getId(), grammarId);
g.setId(grammarId);
g.setPassthrough(true);
}
exec.addGrammar(g);
}
}
FunctionImpl f = new FunctionImpl(mc.getMapping().getId(), c.getFunctionId()); FunctionImpl f = new FunctionImpl(mc.getMapping().getId(), c.getFunctionId());
if (mc.getFunctions().containsKey(c.getFunctionId())) { if (mc.getFunctions().containsKey(c.getFunctionId())) {
f.setFunction(mc.getFunctions().get(c.getFunctionId())); f.setFunction(mc.getFunctions().get(c.getFunctionId()));
...@@ -218,6 +250,26 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -218,6 +250,26 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
return exec; return exec;
} }
/**
 * Resolves the grammar to apply for each mapped element.
 *
 * <p>For every entry of {@code elementGrammarIdsMap} the referenced grammar is
 * looked up in {@code grammarIdMap}; if it is not available (or the id is null),
 * a passthrough placeholder grammar is created instead, so mapping execution can
 * proceed without a modeled grammar.
 *
 * @param mappingId           id of the owning mapping; used for placeholder grammars
 * @param elementGrammarIdsMap element id -> grammar id; may be null
 * @param grammarIdMap         grammar id -> modeled grammar; may be null
 * @return grammar id -> grammar, or null if no element/grammar assignments exist
 */
private Map<String, Grammar> collectGrammars(String mappingId, Map<String, String> elementGrammarIdsMap, Map<String, Grammar> grammarIdMap) {
	if (elementGrammarIdsMap==null) {
		return null;
	}
	Map<String, Grammar> result = new HashMap<>();
	for (Entry<String, String> e : elementGrammarIdsMap.entrySet()) {
		// Use the entry value directly instead of re-looking it up by key
		String grammarId = e.getValue();
		Grammar g = (grammarId!=null && grammarIdMap!=null) ? grammarIdMap.get(grammarId) : null;
		if (g==null) {
			// No modeled grammar available -> passthrough placeholder
			g = new GrammarImpl(mappingId, grammarId);
			g.setId(grammarId);
			g.setPassthrough(true);
		}
		result.put(g.getId(), g);
	}
	return result;
}
private void reset() { private void reset() {
if (sourceResCollector != null) { if (sourceResCollector != null) {
sourceResCollector.setResources(null); sourceResCollector.setResources(null);
...@@ -251,12 +303,14 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica ...@@ -251,12 +303,14 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
} }</