Commit 976f6274 authored by Gradl, Tobias
Browse files

Merge branch 'git' into 'master'

Git

See merge request !10
parents 021dbe7c 8135e574
Pipeline #16234 passed with stages
in 5 minutes and 55 seconds
......@@ -20,11 +20,12 @@ stages:
build:
stage: build
script: ./gradlew assemble
only:
- master
deploy:
stage: deploy
script:
- ./gradlew publish -x test $NEXUS_CREDENTIALS
only:
- master
- sru_opac_mww
- master
\ No newline at end of file
......@@ -41,6 +41,7 @@ ext {
httpComponentsVersion = "4.5.5"
elasticsearchVersion = "7.3.0"
logbackVersion = "1.1.3"
lombokVersion = "1.18.12"
}
dependencies {
......@@ -88,6 +89,10 @@ dependencies {
testImplementation "ch.qos.logback:logback-core:$logbackVersion"
testImplementation "ch.qos.logback:logback-classic:$logbackVersion"
compileOnly "javax.servlet:servlet-api:2.5"
compileOnly "org.projectlombok:lombok:$lombokVersion"
annotationProcessor "org.projectlombok:lombok:$lombokVersion"
testCompileOnly "org.projectlombok:lombok:$lombokVersion"
}
java {
......
......@@ -70,6 +70,7 @@ public class Constants {
public enum AccessMethods {
OAI_PMH ("OAI-PMH"),
FILE ("Online file"),
GIT ("Git Repository"),
OPAC ("OPAC");
private final String colregName;
......
......@@ -331,17 +331,20 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
}
Endpoint e = new Endpoint();
if (accessPojo.getType().equals(AccessMethods.FILE.toString())) {
if (accessPojo.getSubtype()==null) {
e.setMethod("XML");
} else {
e.setMethod(accessPojo.getSubtype().toUpperCase());
}
} else {
e.setMethod(accessPojo.getType());
e.setAccessType(accessPojo.getType());
e.setFileType(accessPojo.getSubtype());
if (accessPojo.getType().equals(AccessMethods.FILE.toString()) ||
accessPojo.getType().equals(AccessMethods.OAI_PMH.toString()) ||
accessPojo.getType().equals(AccessMethods.OPAC.toString())) {
e.setFileType("XML");
} else if (accessPojo.getType().equals(AccessMethods.GIT.toString())) {
e.setFileType("Text");
}
e.setSet(accessPojo.getSet());
e.setUrl(accessPojo.getUri());
e.setPatterns(accessPojo.getPatterns());
e.setDatasets(new ArrayList<Dataset>());
if (accessPojo.getDatamodels()!=null) {
......@@ -380,9 +383,14 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
}
/**
 * Compares two endpoints for identity of their defining properties
 * (access type, file type, URL and set/branch).
 *
 * Uses null-safe comparisons instead of a blanket {@code catch (Exception)}:
 * the broad catch masked NPEs on unset fields and made two endpoints that
 * both have a null accessType/fileType/url compare as NOT equal, which
 * caused duplicate endpoints to be re-created on every sync.
 *
 * @param ep1 first endpoint, must not be null
 * @param ep2 second endpoint, must not be null
 * @return true if all compared properties are pairwise equal (null == null)
 */
private boolean endpointsAreSame(Endpoint ep1, Endpoint ep2) {
	return java.util.Objects.equals(ep1.getAccessType(), ep2.getAccessType()) &&
			java.util.Objects.equals(ep1.getFileType(), ep2.getFileType()) &&
			java.util.Objects.equals(ep1.getUrl(), ep2.getUrl()) &&
			java.util.Objects.equals(ep1.getSet(), ep2.getSet());
}
private boolean datasetsAreSame(Dataset ds1, Dataset ds2) {
......
......@@ -195,8 +195,9 @@ public class CollectionEditorController extends BaseController {
for (EndpointPojo ep : c.getEndpoints()) {
for (DatasetPojo ds : ep.getDatasetPojos()) {
esc = datamodelService.findById(ds.getId());
ds.setDocs(datamodelService.getDocumentCount(esc.getIndexName(), ep.getId()));
if (esc!=null) {
ds.setDocs(datamodelService.getDocumentCount(esc.getIndexName(), ep.getId()));
}
if (dmId==null || ds.getId().equals(dmId)) {
dmId = ds.getId();
model.addAttribute("selectedDsId", ds.getId());
......
......@@ -13,6 +13,7 @@ public interface CrawlManager extends ProcessingListener {
public CrawlState getCrawlState(String crawlId);
public Set<String> getSupportedOnlineAccessMethods();
public Set<String> getSupportedAccessTypes();
public Set<String> getSupportedFileTypes();
public void tryCancelCrawl(String crawlId);
}
package eu.dariah.de.search.crawling;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
......@@ -33,7 +35,6 @@ import eu.dariah.de.search.model.Dataset;
import eu.dariah.de.search.model.Collection;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.query.execution.DocumentService;
import eu.dariah.de.search.service.CollectionService;
import eu.dariah.de.search.service.CrawlService;
import eu.dariah.de.search.service.ResourceIndexingServiceImpl;
......@@ -56,19 +57,18 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
protected Map<UUID, CrawlPipeline> serviceIdServiceMap = new HashMap<UUID, CrawlPipeline>();
private int maxPoolSize;
private Map<String, String> offlineProcessingChains;
private Map<String, String> onlineProcessingChains;
private Map<String, String> accessChains;
private Map<String, String> fileProcessingChains;
public int getMaxPoolSize() { return maxPoolSize; }
public void setMaxPoolSize(int maxPoolSize) { this.maxPoolSize = maxPoolSize; }
public Map<String, String> getOfflineProcessingChains() { return offlineProcessingChains; }
public void setOfflineProcessingChains(Map<String, String> offlineProcessingChains) { this.offlineProcessingChains = offlineProcessingChains; }
public Map<String, String> getOnlineProcessingChains() { return onlineProcessingChains; }
public void setOnlineProcessingChains(Map<String, String> onlineProcessingChains) { this.onlineProcessingChains = onlineProcessingChains; }
public Map<String, String> getAccessChains() { return accessChains; }
public void setAccessChains(Map<String, String> accessChains) { this.accessChains = accessChains; }
public Map<String, String> getFileProcessingChains() { return fileProcessingChains; }
public void setFileProcessingChains(Map<String, String> fileProcessingChains) { this.fileProcessingChains = fileProcessingChains; }
@Override
public void afterPropertiesSet() throws Exception {
......@@ -84,8 +84,13 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
}
@Override
public Set<String> getSupportedOnlineAccessMethods() {
return this.onlineProcessingChains.keySet();
public Set<String> getSupportedAccessTypes() {
return this.getAccessChains().keySet();
}
@Override
// Returns the set of file types a crawl can process: exactly the keys of the
// configured file processing chains map (injected via setFileProcessingChains).
public Set<String> getSupportedFileTypes() {
return this.getFileProcessingChains().keySet();
}
@Override
......@@ -209,27 +214,34 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
}
private CrawlPipeline createPipeline(Endpoint ep, ExtendedDatamodelContainer sc, Crawl c) throws ProcessingConfigException, GenericProcessingException, IOException {
String m = null;
String access = null;
String file = null;
for (AccessMethods mAv : AccessMethods.values()) {
if (mAv.equalsName(ep.getMethod())) {
m = mAv.toString();
if (mAv.equalsName(ep.getAccessType())) {
access = mAv.toString();
break;
}
}
for (FileTypes ftv : FileTypes.values()) {
if (ftv.toString().equals(ep.getMethod())) {
m = ftv.toString();
if (ftv.toString().equals(ep.getFileType())) {
file = ftv.toString();
break;
}
}
if (m==null) {
logger.error(String.format("Unknown access method [%s]; cancelling crawl", ep.getMethod()));
// Online but no access type detected
if (access==null && c.getBaseCrawlId()==null) {
logger.error(String.format("Unknown access type [%s]; cancelling crawl", ep.getAccessType()));
this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR);
return null;
}
if (file==null) {
logger.error(String.format("Unknown file type [%s]; cancelling crawl", ep.getFileType()));
this.updateCrawl(c.getId(), ProcessingServiceStates.ERROR);
return null;
}
CrawlingExecutionContext ctx = new CrawlingExecutionContext(this.baseDownloadPath, c);
Crawler[] crawlers = this.getCrawlers(m, c.getBaseCrawlId()==null);
List<Crawler> crawlers = this.getCrawlers(access, file, c.getBaseCrawlId()==null);
ResourceIndexingServiceImpl indexer;
for (Crawler crawler : crawlers) {
if (crawler instanceof Processor) {
......@@ -334,24 +346,25 @@ public class CrawlManagerImpl implements CrawlManager, ApplicationContextAware,
return cId;
}
private Crawler[] getCrawlers(String method, boolean online) throws ProcessingConfigException {
private List<Crawler> getCrawlers(String accessType, String fileType, boolean online) throws ProcessingConfigException {
List<Crawler> chain = new ArrayList<>();
if (online) {
return this.getCrawlers(method, onlineProcessingChains);
} else {
return this.getCrawlers(method, offlineProcessingChains);
chain.addAll(this.getCrawlers(accessType, accessChains));
}
chain.addAll(this.getCrawlers(fileType, fileProcessingChains));
return chain;
}
private Crawler[] getCrawlers(String method, Map<String, String> processingChain) throws ProcessingConfigException {
private List<Crawler> getCrawlers(String method, Map<String, String> processingChain) throws ProcessingConfigException {
if (!processingChain.containsKey(method)) {
logger.error(String.format("No processing service implemented/configured for method [%s]", method.toString()));
throw new ProcessingConfigException(String.format("No processing service implemented/configured for method [%s]", method.toString()));
}
try {
String[] serviceRefs = processingChain.get(method).split(",");
Crawler[] result = new Crawler[serviceRefs.length];
List<Crawler> result = new ArrayList<>(serviceRefs.length);
for (int i=0; i<serviceRefs.length; i++) {
result[i] = (Crawler)appContext.getBean(serviceRefs[i].trim());
result.add((Crawler)appContext.getBean(serviceRefs[i].trim()));
}
return result;
} catch (Exception e) {
......
package eu.dariah.de.search.crawling;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
......@@ -48,8 +49,8 @@ public class CrawlPipelineImpl implements CrawlPipeline {
@Override public boolean isCancellationRequested() { return cancellationRequested; }
public CrawlPipelineImpl(String crawlId, Crawler[] runnables) throws GenericProcessingException {
if (runnables==null || runnables.length==0) {
public CrawlPipelineImpl(String crawlId, List<Crawler> runnables) throws GenericProcessingException {
if (runnables==null || runnables.isEmpty()) {
throw new GenericProcessingException("Non-empty list of processing services required to instantiate a processing pipeline");
}
......
package eu.dariah.de.search.crawling.crawler;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.joda.time.Period;
import org.slf4j.MDC;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContextAware;
import de.unibamberg.minf.dme.model.datamodel.NonterminalImpl;
import de.unibamberg.minf.processing.consumption.ResourceConsumptionService;
import de.unibamberg.minf.processing.exception.ProcessingConfigException;
import de.unibamberg.minf.processing.service.MatchingFileCollector;
import de.unibamberg.minf.processing.service.ParallelFileProcessingService;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.service.CrawlService;
public class FileProcessor extends ParallelFileProcessingService implements Processor, ResourceConsumptionService {
public class FileProcessor extends ParallelFileProcessingService implements Processor, ResourceConsumptionService, InitializingBean {
@Autowired private CrawlService crawlService;
@Autowired private List<String> antiPatterns;
private boolean initialized = false;
......@@ -35,6 +47,12 @@ public class FileProcessor extends ParallelFileProcessingService implements Proc
return super.isInitialized() && initialized;
}
@SuppressWarnings("unchecked")
@Override
public void afterPropertiesSet() throws Exception {
this.antiPatterns = (List<String>)applicationContext.getBean("antiPatterns");
}
@Override
public void run() {
MDC.put("uid", crawlId);
......@@ -50,6 +68,7 @@ public class FileProcessor extends ParallelFileProcessingService implements Proc
this.setSchema(sc.getModel());
this.crawlId = crawl.getId();
// An original offline crawl
if (crawl.getBaseCrawlId()!=null) {
this.setPath(crawlService.getCrawlDirPath(crawl.getBaseCrawlId()));
......@@ -58,12 +77,17 @@ public class FileProcessor extends ParallelFileProcessingService implements Proc
else {
this.setPath(crawlService.getCrawlDirPath(crawl.getId()));
}
this.initialized = true;
try {
if (endpoint.getPatterns()!=null) {
this.setFileCollector(new MatchingFileCollector(Paths.get(this.getPath()), endpoint.getPatterns()));
this.getFileCollector().setAntiPatternStrings(antiPatterns);
}
super.init();
} catch (ProcessingConfigException e) {
logger.error("Failed to initialize XML processing", e);
} catch (ProcessingConfigException | IOException e) {
logger.error("Failed to initialize file processing", e);
}
}
}
package eu.dariah.de.search.crawling.crawler;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.slf4j.MDC;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import de.unibamberg.minf.processing.git.adapter.GitRepositoryAdapter;
import de.unibamberg.minf.processing.git.service.GitRepositoryProcessingService;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.service.CrawlService;
/**
 * Crawler that clones/pulls a Git repository as the access stage of a crawl.
 * Delegates the actual repository handling to {@link GitRepositoryProcessingService}
 * and obtains its {@link GitRepositoryAdapter} from the Spring application context.
 */
public class GitCrawlerImpl extends GitRepositoryProcessingService implements Crawler, ApplicationContextAware {
	@Autowired private CrawlService crawlService;

	// Set to true once init(...) has configured url/branch/crawl directory.
	private boolean initialized = false;
	// Id of the current crawl; put into the MDC so log lines can be correlated.
	private String crawlId;

	@Override
	public String getUnitMessageCode() {
		return "~eu.dariah.de.minfba.search.crawling.git_crawling.unit";
	}

	@Override
	public String getTitleMessageCode() {
		return "~eu.dariah.de.minfba.search.crawling.git_crawling.title";
	}

	@Override
	public boolean isInitialized() {
		// Ready only when both the superclass and this crawler are configured.
		return initialized && super.isInitialized();
	}

	@Override
	public void run() {
		MDC.put("uid", crawlId);
		// Start from a clean slate: drop any leftover crawl directory before cloning.
		File crawlDirectory = new File(this.getCrawlDir());
		if (crawlDirectory.exists()) {
			FileUtils.deleteQuietly(crawlDirectory);
		}
		super.run();
	}

	@Override
	public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
		// Resolve the repository adapter from the context rather than via @Autowired.
		this.setAdapter(applicationContext.getBean(GitRepositoryAdapter.class));
	}

	@Override
	public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
		this.crawlId = crawl.getId();
		this.setCrawlDir(crawlService.getCrawlDirPath(crawl));
		this.setUrl(endpoint.getUrl());
		// The endpoint's "set" carries the Git branch to check out.
		this.setBranch(endpoint.getSet());
		this.initialized = true;
	}
}
\ No newline at end of file
package eu.dariah.de.search.dao;
import de.unibamberg.minf.dme.model.version.VersionInfo;
import eu.dariah.de.search.dao.base.MongoDao;
/**
 * MongoDB data access interface for {@link VersionInfo} documents.
 * Inherits all CRUD operations from the generic {@link MongoDao}; no
 * version-specific query methods are required.
 */
public interface VersionDao extends MongoDao<VersionInfo> { }
package eu.dariah.de.search.dao;
import org.springframework.stereotype.Repository;
import de.unibamberg.minf.dme.model.version.VersionInfo;
import eu.dariah.de.search.dao.base.BaseMongoDaoImpl;
/**
 * Default {@link VersionDao} implementation backed by MongoDB via the
 * generic {@link BaseMongoDaoImpl}.
 */
@Repository
public class VersionDaoImpl extends BaseMongoDaoImpl<VersionInfo> implements VersionDao {
// Binds the generic base DAO to the VersionInfo document class.
public VersionDaoImpl() {
super(VersionInfo.class);
}
}
......@@ -13,8 +13,13 @@ public class Endpoint implements Identifiable {
private List<Dataset> datasets;
private List<String> patterns;
private String url;
private String method;
private String accessType;
private String fileType;
private String set;
private String dateTimeFormatPattern;
......@@ -42,8 +47,11 @@ public class Endpoint implements Identifiable {
public String getUrl() { return url; }
public void setUrl(String url) { this.url = url; }
public String getMethod() { return method; }
public void setMethod(String method) { this.method = method; }
public String getAccessType() { return accessType; }
public void setAccessType(String accessType) { this.accessType = accessType; }
public String getFileType() { return fileType; }
public void setFileType(String fileType) { this.fileType = fileType; }
public String getSet() { return set; }
public void setSet(String set) { this.set = set; }
......@@ -59,4 +67,7 @@ public class Endpoint implements Identifiable {
public String getUpdatePeriod() { return updatePeriod; }
public void setUpdatePeriod(String updatePeriod) { this.updatePeriod = updatePeriod; }
public List<String> getPatterns() { return patterns; }
public void setPatterns(List<String> patterns) { this.patterns = patterns; }
}
\ No newline at end of file
......@@ -9,7 +9,8 @@ public class EndpointPojo implements Identifiable {
private String id;
private String url;
private String method;
private String accessType;
private String fileType;
private String set;
private List<DatasetPojo> datasetPojos;
......@@ -27,8 +28,11 @@ public class EndpointPojo implements Identifiable {
public String getUrl() { return url; }
public void setUrl(String url) { this.url = url; }
public String getMethod() { return method; }
public void setMethod(String method) { this.method = method; }
public String getAccessType() { return accessType; }
public void setAccessType(String accessType) { this.accessType = accessType; }
public String getFileType() { return fileType; }
public void setFileType(String fileType) { this.fileType = fileType; }
public String getSet() { return set; }
public void setSet(String set) { this.set = set; }
......
......@@ -25,7 +25,8 @@ public class EndpointConverter extends BaseConverter<Endpoint, EndpointPojo> {
EndpointPojo ePojo = new EndpointPojo();
ePojo.setId(endpoint.getId());
ePojo.setUnprocessed(endpoint.isNew());
ePojo.setMethod(endpoint.getMethod());
ePojo.setAccessType(endpoint.getAccessType());
ePojo.setFileType(endpoint.getFileType());
ePojo.setSet(endpoint.getSet());
ePojo.setUrl(endpoint.getUrl());
ePojo.setUnaccessible(endpoint.isUnaccessible());
......
......@@ -162,7 +162,7 @@ public class SruQueryExecutionServiceImpl extends BaseResultService implements I
QueryResultDatasource qrd;
for (Collection c : coll) {
for (Endpoint e : c.getEndpoints()) {
if (e.getMethod().equals("OPAC")) {
if (e.getAccessType().equals("OPAC")) {
qrd = new QueryResultDatasource();
qrd.setProviderName(c.getName(locale.getISO3Language()));
qrd.setProviderId(c.getId());
......
package eu.dariah.de.search.updates;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.attribute.FileAttribute;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.core.CollectionCallback;
import org.springframework.data.mongodb.core.MongoTemplate;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import de.unibamberg.minf.dme.model.version.VersionInfo;
import de.unibamberg.minf.dme.model.version.VersionInfoImpl;
import eu.dariah.de.search.dao.VersionDao;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
public class UpdateServiceImpl implements InitializingBean {
private final static String versionHashPrefix = "GenericSearch";
private String backupsBasePath;
private String database;
private final MessageDigest md;
@Autowired private MongoTemplate mongoTemplate;
@Autowired private ObjectMapper objectMapper;
@Autowired private VersionDao versionDao;
public UpdateServiceImpl() throws NoSuchAlgorithmException {
md = MessageDigest.getInstance("MD5");
}
@Override
public void afterPropertiesSet() throws Exception {