Commit 976f6274 authored by Gradl, Tobias

Merge branch 'git' into 'master'

Git

See merge request search-commons!10
parents 021dbe7c 8135e574
@@ -20,11 +20,12 @@ stages:
 build:
   stage: build
   script: ./gradlew assemble
+  only:
+    - master

 deploy:
   stage: deploy
   script:
     - ./gradlew publish -x test $NEXUS_CREDENTIALS
   only:
     - master
-    - sru_opac_mww
\ No newline at end of file
@@ -41,6 +41,7 @@ ext {
     httpComponentsVersion = "4.5.5"
     elasticsearchVersion = "7.3.0"
     logbackVersion = "1.1.3"
+    lombokVersion = "1.18.12"
 }

 dependencies {
@@ -88,6 +89,10 @@ dependencies {
     testImplementation "ch.qos.logback:logback-core:$logbackVersion"
     testImplementation "ch.qos.logback:logback-classic:$logbackVersion"
     compileOnly "javax.servlet:servlet-api:2.5"
+    compileOnly "org.projectlombok:lombok:$lombokVersion"
+    annotationProcessor "org.projectlombok:lombok:$lombokVersion"
+    testCompileOnly "org.projectlombok:lombok:$lombokVersion"
 }

 java {
...
@@ -70,6 +70,7 @@ public class Constants {
     public enum AccessMethods {
         OAI_PMH ("OAI-PMH"),
         FILE ("Online file"),
+        GIT ("Git Repository"),
         OPAC ("OPAC");

         private final String colregName;
...
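Only the GIT constant is new in this hunk; the rest of the enum body is collapsed. For orientation, a minimal sketch of how the surrounding members are assumed to look, given that CrawlManagerImpl below matches endpoints via equalsName(...) and CollectionSyncClient compares against toString() (the method bodies are assumptions, not shown in this diff):

public enum AccessMethods {
    OAI_PMH ("OAI-PMH"),
    FILE ("Online file"),
    GIT ("Git Repository"),
    OPAC ("OPAC");

    private final String colregName;

    AccessMethods(String colregName) {
        this.colregName = colregName;
    }

    // Assumed: matches the human-readable ColReg name, as used by
    // CrawlManagerImpl.createPipeline via mAv.equalsName(ep.getAccessType())
    public boolean equalsName(String name) {
        return colregName.equals(name);
    }

    // Assumed: toString() yields the ColReg name, so that
    // AccessMethods.GIT.toString() equals "Git Repository"
    @Override
    public String toString() {
        return colregName;
    }
}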
@@ -331,17 +331,20 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
         }
         Endpoint e = new Endpoint();
-        if (accessPojo.getType().equals(AccessMethods.FILE.toString())) {
-            if (accessPojo.getSubtype()==null) {
-                e.setMethod("XML");
-            } else {
-                e.setMethod(accessPojo.getSubtype().toUpperCase());
-            }
-        } else {
-            e.setMethod(accessPojo.getType());
-        }
+        e.setAccessType(accessPojo.getType());
+        e.setFileType(accessPojo.getSubtype());
+
+        if (accessPojo.getType().equals(AccessMethods.FILE.toString()) ||
+                accessPojo.getType().equals(AccessMethods.OAI_PMH.toString()) ||
+                accessPojo.getType().equals(AccessMethods.OPAC.toString())) {
+            e.setFileType("XML");
+        } else if (accessPojo.getType().equals(AccessMethods.GIT.toString())) {
+            e.setFileType("Text");
+        }
         e.setSet(accessPojo.getSet());
         e.setUrl(accessPojo.getUri());
+        e.setPatterns(accessPojo.getPatterns());
         e.setDatasets(new ArrayList<Dataset>());
         if (accessPojo.getDatamodels()!=null) {
@@ -380,9 +383,14 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
     }

     private boolean endpointsAreSame(Endpoint ep1, Endpoint ep2) {
-        return ep1.getMethod().equals(ep2.getMethod()) &&
-                ep1.getUrl().equals(ep2.getUrl()) &&
-                (ep1.getSet()==null && ep2.getSet()==null || ep1.getSet().equals(ep2.getSet()) );
+        try {
+            return ep1.getAccessType().equals(ep2.getAccessType()) &&
+                    ep1.getFileType().equals(ep2.getFileType()) &&
+                    ep1.getUrl().equals(ep2.getUrl()) &&
+                    (ep1.getSet()==null && ep2.getSet()==null || ep1.getSet().equals(ep2.getSet()) );
+        } catch (Exception e) {
+            return false;
+        }
     }

     private boolean datasetsAreSame(Dataset ds1, Dataset ds2) {
...
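The new try/catch makes the comparison null-safe when accessType, fileType, or url is unset, but it also swallows any other runtime failure. A possible alternative sketch using java.util.Objects, with one semantic difference worth noting: Objects.equals treats two nulls as equal, whereas the version above turns a null accessType, fileType, or url into false via the caught NullPointerException.

private boolean endpointsAreSame(Endpoint ep1, Endpoint ep2) {
    // Objects.equals(a, b) is null-safe: true if both are null, false if only one is
    return Objects.equals(ep1.getAccessType(), ep2.getAccessType())
            && Objects.equals(ep1.getFileType(), ep2.getFileType())
            && Objects.equals(ep1.getUrl(), ep2.getUrl())
            && Objects.equals(ep1.getSet(), ep2.getSet());
}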
@@ -195,8 +195,9 @@ public class CollectionEditorController extends BaseController {
         for (EndpointPojo ep : c.getEndpoints()) {
             for (DatasetPojo ds : ep.getDatasetPojos()) {
                 esc = datamodelService.findById(ds.getId());
+                if (esc!=null) {
                     ds.setDocs(datamodelService.getDocumentCount(esc.getIndexName(), ep.getId()));
+                }
                 if (dmId==null || ds.getId().equals(dmId)) {
                     dmId = ds.getId();
                     model.addAttribute("selectedDsId", ds.getId());
...
@@ -13,6 +13,7 @@ public interface CrawlManager extends ProcessingListener {
     public CrawlState getCrawlState(String crawlId);

-    public Set<String> getSupportedOnlineAccessMethods();
+    public Set<String> getSupportedAccessTypes();
+    public Set<String> getSupportedFileTypes();

     public void tryCancelCrawl(String crawlId);
 }
 package eu.dariah.de.search.crawling;

 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
@@ -33,7 +35,6 @@ import eu.dariah.de.search.model.Dataset;
 import eu.dariah.de.search.model.Collection;
 import eu.dariah.de.search.model.Endpoint;
 import eu.dariah.de.search.model.ExtendedDatamodelContainer;
-import eu.dariah.de.search.query.execution.DocumentService;
 import eu.dariah.de.search.service.CollectionService;
 import eu.dariah.de.search.service.CrawlService;
 import eu.dariah.de.search.service.ResourceIndexingServiceImpl;
@@ -56,19 +57,18 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
     protected Map<UUID, CrawlPipeline> serviceIdServiceMap = new HashMap<UUID, CrawlPipeline>();

     private int maxPoolSize;
-    private Map<String, String> offlineProcessingChains;
-    private Map<String, String> onlineProcessingChains;
+    private Map<String, String> accessChains;
+    private Map<String, String> fileProcessingChains;

     public int getMaxPoolSize() { return maxPoolSize; }
     public void setMaxPoolSize(int maxPoolSize) { this.maxPoolSize = maxPoolSize; }

-    public Map<String, String> getOfflineProcessingChains() { return offlineProcessingChains; }
-    public void setOfflineProcessingChains(Map<String, String> offlineProcessingChains) { this.offlineProcessingChains = offlineProcessingChains; }
-    public Map<String, String> getOnlineProcessingChains() { return onlineProcessingChains; }
-    public void setOnlineProcessingChains(Map<String, String> onlineProcessingChains) { this.onlineProcessingChains = onlineProcessingChains; }
+    public Map<String, String> getAccessChains() { return accessChains; }
+    public void setAccessChains(Map<String, String> accessChains) { this.accessChains = accessChains; }
+    public Map<String, String> getFileProcessingChains() { return fileProcessingChains; }
+    public void setFileProcessingChains(Map<String, String> fileProcessingChains) { this.fileProcessingChains = fileProcessingChains; }

     @Override
     public void afterPropertiesSet() throws Exception {
@@ -84,8 +84,13 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
     }

     @Override
-    public Set<String> getSupportedOnlineAccessMethods() {
-        return this.onlineProcessingChains.keySet();
+    public Set<String> getSupportedAccessTypes() {
+        return this.getAccessChains().keySet();
+    }
+
+    @Override
+    public Set<String> getSupportedFileTypes() {
+        return this.getFileProcessingChains().keySet();
     }

     @Override
@@ -209,27 +214,34 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
     }

     private CrawlPipeline createPipeline(Endpoint ep, ExtendedDatamodelContainer sc, Crawl c) throws ProcessingConfigException, GenericProcessingException, IOException {
-        String m = null;
+        String access = null;
+        String file = null;
         for (AccessMethods mAv : AccessMethods.values()) {
-            if (mAv.equalsName(ep.getMethod())) {
-                m = mAv.toString();
+            if (mAv.equalsName(ep.getAccessType())) {
+                access = mAv.toString();
                 break;
             }
         }
         for (FileTypes ftv : FileTypes.values()) {
-            if (ftv.toString().equals(ep.getMethod())) {
-                m = ftv.toString();
+            if (ftv.toString().equals(ep.getFileType())) {
+                file = ftv.toString();
                 break;
             }
         }
-        if (m==null) {
-            logger.error(String.format("Unknown access method [%s]; cancelling crawl", ep.getMethod()));
+        // Online crawl, but no supported access type detected
+        if (access==null && c.getBaseCrawlId()==null) {
+            logger.error(String.format("Unknown access type [%s]; cancelling crawl", ep.getAccessType()));
+            this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR);
+            return null;
+        }
+        if (file==null) {
+            logger.error(String.format("Unknown file type [%s]; cancelling crawl", ep.getFileType()));
             this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR);
             return null;
         }

         CrawlingExecutionContext ctx = new CrawlingExecutionContext(this.baseDownloadPath, c);
-        Crawler[] crawlers = this.getCrawlers(m, c.getBaseCrawlId()==null);
+        List<Crawler> crawlers = this.getCrawlers(access, file, c.getBaseCrawlId()==null);
         ResourceIndexingServiceImpl indexer;
         for (Crawler crawler : crawlers) {
             if (crawler instanceof Processor) {
@@ -334,24 +346,25 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
         return cId;
     }

-    private Crawler[] getCrawlers(String method, boolean online) throws ProcessingConfigException {
+    private List<Crawler> getCrawlers(String accessType, String fileType, boolean online) throws ProcessingConfigException {
+        List<Crawler> chain = new ArrayList<>();
         if (online) {
-            return this.getCrawlers(method, onlineProcessingChains);
-        } else {
-            return this.getCrawlers(method, offlineProcessingChains);
+            chain.addAll(this.getCrawlers(accessType, accessChains));
         }
+        chain.addAll(this.getCrawlers(fileType, fileProcessingChains));
+        return chain;
     }

-    private Crawler[] getCrawlers(String method, Map<String, String> processingChain) throws ProcessingConfigException {
+    private List<Crawler> getCrawlers(String method, Map<String, String> processingChain) throws ProcessingConfigException {
         if (!processingChain.containsKey(method)) {
             logger.error(String.format("No processing service implemented/configured for method [%s]", method));
             throw new ProcessingConfigException(String.format("No processing service implemented/configured for method [%s]", method));
         }
         try {
             String[] serviceRefs = processingChain.get(method).split(",");
-            Crawler[] result = new Crawler[serviceRefs.length];
+            List<Crawler> result = new ArrayList<>(serviceRefs.length);
             for (int i=0; i<serviceRefs.length; i++) {
-                result[i] = (Crawler)appContext.getBean(serviceRefs[i].trim());
+                result.add((Crawler)appContext.getBean(serviceRefs[i].trim()));
             }
             return result;
         } catch (Exception e) {
...
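With this change a crawl pipeline is composed from two maps instead of one: accessChains keyed by access type (the download stage, used only for online crawls) and fileProcessingChains keyed by file type (the parsing/indexing stage, always appended). A hypothetical wiring sketch; the bean names and chain entries are illustrative assumptions, not taken from this commit:

CrawlManagerImpl manager = new CrawlManagerImpl();
manager.setMaxPoolSize(4);

// Access stage: how endpoint content is fetched. Keys must match the
// strings produced by AccessMethods.toString().
Map<String, String> accessChains = new HashMap<>();
accessChains.put("OAI-PMH", "oaiPmhHarvester");      // hypothetical bean ref
accessChains.put("Git Repository", "gitCrawler");    // hypothetical bean ref
manager.setAccessChains(accessChains);

// File stage: how fetched files are processed; values are comma-separated
// bean references resolved via appContext.getBean(...) in getCrawlers.
Map<String, String> fileChains = new HashMap<>();
fileChains.put("XML", "xmlProcessor,resourceIndexingService");   // hypothetical
fileChains.put("Text", "fileProcessor,resourceIndexingService"); // hypothetical
manager.setFileProcessingChains(fileChains);

For an online crawl (baseCrawlId==null) getCrawlers concatenates the access chain and the file chain; for a recrawl of already downloaded data, only the file chain runs.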
 package eu.dariah.de.search.crawling;

 import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.UUID;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -48,8 +49,8 @@ public class CrawlPipelineImpl implements CrawlPipeline {
     @Override public boolean isCancellationRequested() { return cancellationRequested; }

-    public CrawlPipelineImpl(String crawlId, Crawler[] runnables) throws GenericProcessingException {
-        if (runnables==null || runnables.length==0) {
+    public CrawlPipelineImpl(String crawlId, List<Crawler> runnables) throws GenericProcessingException {
+        if (runnables==null || runnables.isEmpty()) {
             throw new GenericProcessingException("Non-empty array of processing services required to instantiate a processing pipeline");
         }
...
 package eu.dariah.de.search.crawling.crawler;

+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.joda.time.Period;
 import org.slf4j.MDC;
+import org.springframework.beans.factory.InitializingBean;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.ApplicationContextAware;

 import de.unibamberg.minf.dme.model.datamodel.NonterminalImpl;
 import de.unibamberg.minf.processing.consumption.ResourceConsumptionService;
 import de.unibamberg.minf.processing.exception.ProcessingConfigException;
+import de.unibamberg.minf.processing.service.MatchingFileCollector;
 import de.unibamberg.minf.processing.service.ParallelFileProcessingService;
 import eu.dariah.de.search.model.Crawl;
 import eu.dariah.de.search.model.Endpoint;
 import eu.dariah.de.search.model.ExtendedDatamodelContainer;
 import eu.dariah.de.search.service.CrawlService;

-public class FileProcessor extends ParallelFileProcessingService implements Processor, ResourceConsumptionService {
+public class FileProcessor extends ParallelFileProcessingService implements Processor, ResourceConsumptionService, InitializingBean {
     @Autowired private CrawlService crawlService;

+    @Autowired private List<String> antiPatterns;
+
     private boolean initialized = false;
@@ -35,6 +47,12 @@ public class FileProcessor extends ParallelFileProcessingService implements Proc
         return super.isInitialized() && initialized;
     }

+    @SuppressWarnings("unchecked")
+    @Override
+    public void afterPropertiesSet() throws Exception {
+        this.antiPatterns = (List<String>)applicationContext.getBean("antiPatterns");
+    }
+
     @Override
     public void run() {
         MDC.put("uid", crawlId);
@@ -50,6 +68,7 @@ public class FileProcessor extends ParallelFileProcessingService implements Proc
         this.setSchema(sc.getModel());
         this.crawlId = crawl.getId();

         // An original offline crawl
         if (crawl.getBaseCrawlId()!=null) {
             this.setPath(crawlService.getCrawlDirPath(crawl.getBaseCrawlId()));
@@ -58,12 +77,17 @@ public class FileProcessor extends ParallelFileProcessingService implements Proc
         else {
             this.setPath(crawlService.getCrawlDirPath(crawl.getId()));
         }

         this.initialized = true;
         try {
+            if (endpoint.getPatterns()!=null) {
+                this.setFileCollector(new MatchingFileCollector(Paths.get(this.getPath()), endpoint.getPatterns()));
+                this.getFileCollector().setAntiPatternStrings(antiPatterns);
+            }
             super.init();
-        } catch (ProcessingConfigException e) {
-            logger.error("Failed to initialize XML processing", e);
+        } catch (ProcessingConfigException | IOException e) {
+            logger.error("Failed to initialize file processing", e);
         }
     }
 }
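afterPropertiesSet resolves a List<String> bean named "antiPatterns" from the application context, so the context must define that bean or FileProcessor fails to initialize. A sketch of a matching definition, assuming Java-based Spring configuration (the project may equally declare it in XML; the pattern value is illustrative):

@Configuration
public class CrawlingConfig {

    // Bean name must be "antiPatterns" to match the getBean lookup above.
    // Glob patterns excluded from file collection, e.g. Git metadata.
    @Bean(name = "antiPatterns")
    public List<String> antiPatterns() {
        return Arrays.asList("**/.git/**");
    }
}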
package eu.dariah.de.search.crawling.crawler;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.slf4j.MDC;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import de.unibamberg.minf.processing.git.adapter.GitRepositoryAdapter;
import de.unibamberg.minf.processing.git.service.GitRepositoryProcessingService;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.service.CrawlService;
public class GitCrawlerImpl extends GitRepositoryProcessingService implements Crawler, ApplicationContextAware {
@Autowired private CrawlService crawlService;
private boolean initialized = false;
private String crawlId;
@Override
public String getUnitMessageCode() {
return "~eu.dariah.de.minfba.search.crawling.git_crawling.unit";
}
@Override
public String getTitleMessageCode() {
return "~eu.dariah.de.minfba.search.crawling.git_crawling.title";
}
@Override
public boolean isInitialized() {
return super.isInitialized() && initialized;
}
@Override
public void run() {
MDC.put("uid", crawlId);
File cDir = new File(this.getCrawlDir());
if (cDir.exists()) {
FileUtils.deleteQuietly(cDir);
}
super.run();
}
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
this.setAdapter(applicationContext.getBean(GitRepositoryAdapter.class));
}
@Override
public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
this.setUrl(endpoint.getUrl());
this.setBranch(endpoint.getSet());
this.crawlId = crawl.getId();
this.setCrawlDir(crawlService.getCrawlDirPath(crawl));
this.initialized = true;
}
}
\ No newline at end of file
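GitCrawlerImpl maps the generic Endpoint fields onto the Git adapter: url is the clone URL and set is reused as the branch to check out, mirroring how set holds the OAI-PMH set name elsewhere. A sketch of an endpoint as the sync client above would produce it; the URL and patterns are illustrative:

Endpoint ep = new Endpoint();
ep.setAccessType("Git Repository");   // AccessMethods.GIT
ep.setFileType("Text");               // assigned by CollectionSyncClient for Git endpoints
ep.setUrl("https://gitlab.example.org/corpus/texts.git"); // illustrative clone URL
ep.setSet("master");                  // interpreted as the branch via setBranch(...)
ep.setPatterns(Arrays.asList("**/*.txt")); // consumed by FileProcessor's MatchingFileCollector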
package eu.dariah.de.search.dao;
import de.unibamberg.minf.dme.model.version.VersionInfo;
import eu.dariah.de.search.dao.base.MongoDao;
public interface VersionDao extends MongoDao<VersionInfo> { }
package eu.dariah.de.search.dao;
import org.springframework.stereotype.Repository;
import de.unibamberg.minf.dme.model.version.VersionInfo;
import eu.dariah.de.search.dao.base.BaseMongoDaoImpl;
@Repository
public class VersionDaoImpl extends BaseMongoDaoImpl<VersionInfo> implements VersionDao {
public VersionDaoImpl() {
super(VersionInfo.class);
}
}
@@ -13,8 +13,13 @@ public class Endpoint implements Identifiable {
     private List<Dataset> datasets;

+    private List<String> patterns;
+
     private String url;