dariah / search · Commit 123389b5
authored May 03, 2021 by Gradl, Tobias

420: Implement configurable crawls -> [GS: Repetitive Crawl Model]
(OPENED) Task-Url: #420

parent 2dfbcb90
Changes: 9
build.gradle

...
@@ -21,7 +21,7 @@ allprojects {
    coreVersion = "6.1-SNAPSHOT"
    gtfVersion = "2.0.0-SNAPSHOT"
    processingVersion = "4.1.0-SNAPSHOT"
-   colregModelVersion = "4.3.2-RELEASE"
+   colregModelVersion = "4.3.4-RELEASE"
    dariahSpVersion = "2.1.4-SNAPSHOT"
    jsonAssertVersion = "1.5.0"
...
search-core/src/main/java/eu/dariah/de/search/api/client/CollectionSyncClient.java

...
@@ -182,6 +182,8 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
    if (cCurrent.getEndpoints() != null) {
        for (Endpoint eCurrent : cCurrent.getEndpoints()) {
            if (this.endpointsAreSame(eCurrent, eFetched)) {
                eCurrent.setAccessModelId(eFetched.getAccessModelId());
                mergeOrUnchanged = true;
                this.mergeDatamodelReferences(eCurrent, eFetched);
                deleteEndpoints.remove(eCurrent);
...
@@ -274,7 +276,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
    Period shortestPeriod = null;
    Period updatePeriod;
    for (AccrualPojo accrual : fetchedCollection.getAccrualPojos()) {
-       if (this.knownUpdatePolicies.contains(accrual.getAccrualPolicy())) {
+       if (this.knownUpdatePolicies != null && this.knownUpdatePolicies.contains(accrual.getAccrualPolicy())) {
            updatePeriod = this.updateFrequencyMap.get(accrual.getAccrualPeriodicity());
            if (updatePeriod == null) {
                updatePeriod = this.updateFrequencyMap.get(defaultUnclosedFrequencyKey);
...
@@ -346,6 +348,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
        }
    }
    e.setAccessModelId(accessPojo.getAccessModelId());
    e.setUrl(accessPojo.getUri());
    e.setPatterns(accessPojo.getPatterns());
...
@@ -394,7 +397,7 @@ public class CollectionSyncClient extends BaseApiClientImpl<CollectionApiPojo, E
        return false;
    }
}

private boolean datasetsAreSame(Dataset ds1, Dataset ds2) {
    if (!ds1.getId().equals(ds2.getId())) {
        return false;
...
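The @@ -274 hunk above adds a null guard around knownUpdatePolicies before the accrual lookup. Below is a minimal, self-contained sketch of that lookup logic, using the policy list and a subset of the frequency-to-period values defined as beans in CrawlingConfig further down; the class, method and field names here are illustrative stand-ins, not the actual members of CollectionSyncClient.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class UpdatePeriodLookupSketch {

    // In the real code these are injected beans; see updateFrequencyMap() and
    // knownUpdatePolicies() in CrawlingConfig below. Only a subset is populated here.
    private final Map<String, String> updateFrequencyMap = new HashMap<>();
    private final List<String> knownUpdatePolicies = Arrays.asList(
            "http://purl.org/cld/accpol/passive",
            "http://purl.org/cld/accpol/active",
            "http://purl.org/cld/accpol/partial");
    private final String defaultUnclosedFrequencyKey = "_defaultUnclosed";

    public UpdatePeriodLookupSketch() {
        updateFrequencyMap.put("http://purl.org/cld/freq/daily", "P3D");
        updateFrequencyMap.put("http://purl.org/cld/freq/monthly", "P2W");
        updateFrequencyMap.put(defaultUnclosedFrequencyKey, "P1M");
    }

    // Returns the ISO-8601 period string for a policy/periodicity pair, or null for unknown policies.
    public String resolveUpdatePeriod(String accrualPolicy, String accrualPeriodicity) {
        // The null check mirrors the guard added in the @@ -274 hunk above.
        if (knownUpdatePolicies != null && knownUpdatePolicies.contains(accrualPolicy)) {
            String period = updateFrequencyMap.get(accrualPeriodicity);
            if (period == null) {
                period = updateFrequencyMap.get(defaultUnclosedFrequencyKey);
            }
            return period;
        }
        return null;
    }

    public static void main(String[] args) {
        UpdatePeriodLookupSketch sketch = new UpdatePeriodLookupSketch();
        System.out.println(sketch.resolveUpdatePeriod(
                "http://purl.org/cld/accpol/active", "http://purl.org/cld/freq/daily"));       // P3D
        System.out.println(sketch.resolveUpdatePeriod(
                "http://purl.org/cld/accpol/active", "http://example.org/unknown-frequency")); // P1M (default)
    }
}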
search-core/src/main/java/eu/dariah/de/search/config/nested/DatamodelConfigProperties.java

...
@@ -10,6 +10,7 @@ public class DatamodelConfigProperties {
    private String presentation;
    private String indexing;
    private String metadata;
    private String crawling;
    private String oaidcModel;
    private List<String> modelsWithMessageCodes;
    private String modelsMessageCodePrefix;
...
search-core/src/main/java/eu/dariah/de/search/crawling/crawler/RepetitiveFileCrawlerImpl.java (new file, 0 → 100644)

package eu.dariah.de.search.crawling.crawler;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;

import eu.dariah.de.search.config.MainConfigProperties;
import eu.dariah.de.search.crawling.files.FileDownloader;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
import eu.dariah.de.search.model.ExtendedMappingContainer;
import eu.dariah.de.search.service.DatamodelService;
import eu.dariah.de.search.service.MappingService;
import lombok.Getter;
import lombok.Setter;

public class RepetitiveFileCrawlerImpl extends FileDownloader implements InitializingBean {
    @Autowired private MainConfigProperties mainConfig;
    @Autowired protected DatamodelService datamodelService;
    @Autowired private MappingService mappingService;

    @Getter @Setter Map<String, String> fileProcessingServiceMap;

    private List<String> handledUrls;
    private ExtendedMappingContainer mapping = null;

    // 1. Use downloader to get first file
    // 2. Get a mapping
    // 3. Execute the mapping

    @Override
    public String getUnitMessageCode() {
        return "~eu.dariah.de.minfba.search.crawling.file.crawler.unit";
    }

    @Override
    public String getTitleMessageCode() {
        return "~eu.dariah.de.minfba.search.crawling.file.crawler.title";
    }

    @Override
    public void afterPropertiesSet() throws Exception {
        this.fileName = UUID.randomUUID().toString();
    }

    @Override
    public void init(Endpoint endpoint, Crawl crawl, ExtendedDatamodelContainer sc) {
        super.init(endpoint, crawl, sc);
        endpoint.getFileType();
        endpoint.getAccessModelId();
        this.handledUrls = new ArrayList<>();
        if (mainConfig.getDatamodels().getCrawling() == null) {
            logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
        } else {
            if (endpoint.getAccessModelId() != null) {
                logger.info("Dedicated access modell configured: {}", endpoint.getAccessModelId());
                mapping = mappingService.getMappingBySourceAndTarget(endpoint.getAccessModelId(), mainConfig.getDatamodels().getCrawling());
            } else {
                logger.info("No dedicated access modell, using datamodel: {}", sc.getModel().getId());
                mapping = mappingService.getMappingBySourceAndTarget(sc.getModel().getId(), mainConfig.getDatamodels().getCrawling());
            }
            if (mapping == null) {
                logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
            }
        }
    }

    @Override
    public void downloadFile() {
        super.downloadFile();
        File f = new File(this.getOutputPath());
        if (f.exists()) {
            logger.debug("file exists: {}", f.getAbsolutePath());
        }
    }

    @Override
    protected String getOutputFilename() {
        return fileName + "." + (handledUrls == null ? 0 : handledUrls.size());
    }
}
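The new crawler names its downloads from a random UUID (set in afterPropertiesSet) plus the current size of handledUrls, so repeated downloads within one crawl get distinct, ordered suffixes. A small stand-alone illustration of that naming scheme follows; OutputFilenameSketch and its fields are hypothetical stand-ins, not part of the project.

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

public class OutputFilenameSketch {
    private final String fileName = UUID.randomUUID().toString(); // as in afterPropertiesSet()
    private final List<String> handledUrls = new ArrayList<>();   // grows as URLs are handled

    // Mirrors RepetitiveFileCrawlerImpl.getOutputFilename()
    String getOutputFilename() {
        return fileName + "." + (handledUrls == null ? 0 : handledUrls.size());
    }

    public static void main(String[] args) {
        OutputFilenameSketch sketch = new OutputFilenameSketch();
        System.out.println(sketch.getOutputFilename()); // <uuid>.0
        sketch.handledUrls.add("https://example.org/data/page-1.xml");
        System.out.println(sketch.getOutputFilename()); // same <uuid>, suffix .1
    }
}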
search-core/src/main/java/eu/dariah/de/search/crawling/files/FileDownloader.java

...
@@ -11,8 +11,6 @@ import java.net.URL;
    import java.nio.channels.Channels;
    import java.nio.channels.ReadableByteChannel;
-   import org.springframework.beans.factory.annotation.Value;
    import de.unibamberg.minf.processing.exception.ResourceProcessingException;
    import eu.dariah.de.search.crawling.crawler.Crawler;
    import eu.dariah.de.search.model.Crawl;
...
@@ -21,15 +19,14 @@ import eu.dariah.de.search.model.ExtendedDatamodelContainer;
    public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
-       @Value("${processing.download.filename:download.tmp}")
-       private String fileName;
+       protected String fileName;
-       private URI inputURI;
+       protected URI inputURI;
        private int bufferSize = 1024;
        private int chunkSize = 1048576;
-       private boolean initialized = false;
+       protected boolean initialized = false;

        public int getBufferSize() {
            return bufferSize;
        }
...
search-core/src/main/java/eu/dariah/de/search/model/Endpoint.java

...
@@ -27,6 +27,8 @@ public class Endpoint implements Identifiable {
    private String url;
    private String accessType;
    private String accessModelId;
    private String fileType;
    private String dateTimeFormatPattern;
...
search-ui/src/main/java/eu/dariah/de/search/config/CrawlingConfig.java

...
@@ -27,6 +27,7 @@ import eu.dariah.de.search.crawling.crawler.FileProcessor;
import eu.dariah.de.search.crawling.crawler.GitCrawlerImpl;
import eu.dariah.de.search.crawling.crawler.IndexCleaner;
import eu.dariah.de.search.crawling.crawler.OaiPmhCrawlerImpl;
import eu.dariah.de.search.crawling.crawler.RepetitiveFileCrawlerImpl;
import eu.dariah.de.search.crawling.files.FileDownloader;
import eu.dariah.de.search.crawling.files.FileUnarchiver;
import eu.dariah.de.search.crawling.files.FileUnpacker;
...
@@ -63,7 +64,7 @@ public class CrawlingConfig extends CrawlingConfigProperties {
    Map<String, String> accessChainMap = new HashMap<>();
    accessChainMap.put("OAI-PMH", "oaiPmhCrawler");
    accessChainMap.put("Git Repository", "gitCrawler");
-   accessChainMap.put("Online file", "fileDownloader");
+   accessChainMap.put("Online file", "fileCrawler");
    crawlManager.setAccessChains(accessChainMap);

    Map<String, String> fileProcessingChainMap = new HashMap<>();
...
@@ -105,8 +106,17 @@ public class CrawlingConfig extends CrawlingConfigProperties {
    @Bean
    @Scope("prototype")
-   public FileDownloader fileDownloader() {
-       return new FileDownloader();
+   public RepetitiveFileCrawlerImpl fileCrawler() {
+       RepetitiveFileCrawlerImpl fileCrawler = new RepetitiveFileCrawlerImpl();
+       Map<String, String> fileProcessingServiceMap = new HashMap<>();
+       fileProcessingServiceMap.put("XML", "xmlStringProcessor");
+       fileProcessingServiceMap.put("JSON", "jsonProcessingService");
+       fileProcessingServiceMap.put("CSV", "csvStringProcessor");
+       fileProcessingServiceMap.put("TSV", "tsvStringProcessor");
+       fileProcessingServiceMap.put("TEXT", "textStringProcessor");
+       fileCrawler.setFileProcessingServiceMap(fileProcessingServiceMap);
+       return fileCrawler;
    }

    @Bean
...
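The fileCrawler bean wires a map from endpoint file types (XML, JSON, CSV, TSV, TEXT) to processing-service bean names. How the crawler ultimately consumes that map is not shown in this diff; the sketch below only illustrates the kind of lookup such a map enables, with a hypothetical selectProcessor helper and an assumed plain-text fallback.

import java.util.HashMap;
import java.util.Map;

public class FileProcessingLookupSketch {
    private final Map<String, String> fileProcessingServiceMap = new HashMap<>();

    public FileProcessingLookupSketch() {
        // Same entries as in CrawlingConfig.fileCrawler() above
        fileProcessingServiceMap.put("XML", "xmlStringProcessor");
        fileProcessingServiceMap.put("JSON", "jsonProcessingService");
        fileProcessingServiceMap.put("CSV", "csvStringProcessor");
        fileProcessingServiceMap.put("TSV", "tsvStringProcessor");
        fileProcessingServiceMap.put("TEXT", "textStringProcessor");
    }

    // Hypothetical helper: resolve the processing-service bean name for an endpoint's file type.
    String selectProcessor(String endpointFileType) {
        // Falling back to plain text processing is an assumption, not documented project behavior.
        return fileProcessingServiceMap.getOrDefault(endpointFileType, "textStringProcessor");
    }

    public static void main(String[] args) {
        FileProcessingLookupSketch sketch = new FileProcessingLookupSketch();
        System.out.println(sketch.selectProcessor("JSON")); // jsonProcessingService
        System.out.println(sketch.selectProcessor("PDF"));  // textStringProcessor (fallback assumption)
    }
}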
@@ -233,4 +243,37 @@ public class CrawlingConfig extends CrawlingConfigProperties {
        fileProcessingAntiPatterns.add(".git/**");
        return fileProcessingAntiPatterns;
    }

    @Bean
    public Map<String, String> updateFrequencyMap() {
        Map<String, String> updateFrequencyMap = new HashMap<>();
        updateFrequencyMap.put("http://purl.org/cld/freq/triennial", "P3M");
        updateFrequencyMap.put("http://purl.org/cld/freq/biennial", "P3M");
        updateFrequencyMap.put("http://purl.org/cld/freq/annual", "P3M");
        updateFrequencyMap.put("http://purl.org/cld/freq/semiannual", "P2M");
        updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAYear", "P2M");
        updateFrequencyMap.put("http://purl.org/cld/freq/quarterly", "P1M");
        updateFrequencyMap.put("http://purl.org/cld/freq/bimonthly", "P1M");
        updateFrequencyMap.put("http://purl.org/cld/freq/monthly", "P2W");
        updateFrequencyMap.put("http://purl.org/cld/freq/semimonthly", "P2W");
        updateFrequencyMap.put("http://purl.org/cld/freq/biweekly", "P2W");
        updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAMonth", "P1W");
        updateFrequencyMap.put("http://purl.org/cld/freq/weekly", "P1W");
        updateFrequencyMap.put("http://purl.org/cld/freq/semiweekly", "P1W");
        updateFrequencyMap.put("http://purl.org/cld/freq/threeTimesAWeek", "P3D");
        updateFrequencyMap.put("http://purl.org/cld/freq/daily", "P3D");
        updateFrequencyMap.put("http://purl.org/cld/freq/continuous", "P3D");
        updateFrequencyMap.put("http://purl.org/cld/freq/completelyIrregular", "P3D");
        updateFrequencyMap.put("_defaultUnclosed", "P1M");
        return updateFrequencyMap;
    }

    @Bean
    public List<String> knownUpdatePolicies() {
        List<String> knownUpdatePolicies = new ArrayList<>();
        knownUpdatePolicies.add("http://purl.org/cld/accpol/passive");
        knownUpdatePolicies.add("http://purl.org/cld/accpol/active");
        knownUpdatePolicies.add("http://purl.org/cld/accpol/partial");
        return knownUpdatePolicies;
    }
}
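The updateFrequencyMap values are ISO-8601 period strings (P3D, P1W, P2W, P1M, ...) that map accrual frequencies to re-crawl intervals. A brief illustration of what such values encode, using java.time; whether the project parses them with java.time or another Period type is not visible in this diff.

import java.time.LocalDate;
import java.time.Period;

public class CrawlPeriodSketch {
    public static void main(String[] args) {
        // "daily" collections are re-crawled roughly every three days according to the map above
        Period daily = Period.parse("P3D");
        // unknown or unclosed frequencies fall back to "_defaultUnclosed" = one month
        Period fallback = Period.parse("P1M");

        LocalDate today = LocalDate.now();
        System.out.println("next crawl (daily frequency):   " + today.plus(daily));
        System.out.println("next crawl (default frequency): " + today.plus(fallback));
    }
}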
search-ui/src/main/resources/application.yml

...
@@ -47,6 +47,7 @@ datamodels:
  indexing: 5a27ab9b0246440d479c6c46
  presentation: 5b9004394c7b0405ccf90b00
  metadata: 5cd3f505dd960f585e960f63
  crawling: 60812e54667bd50488d2b237
  # modelsWithMessageCodes:
  #   - 598f06e706bffc03a8e44f82
  #   - 598f06e706bffc03a8e44f83
...
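The new datamodels.crawling entry carries the id of the GS: Repetitive Crawl Model that RepetitiveFileCrawlerImpl reads via mainConfig.getDatamodels().getCrawling(). A simplified sketch of how such a key could bind to the crawling field of DatamodelConfigProperties with Spring Boot; the prefix and wiring shown here are assumptions, not taken from this diff.

import org.springframework.boot.context.properties.ConfigurationProperties;

// Simplified stand-in for eu.dariah.de.search.config.nested.DatamodelConfigProperties;
// only the field relevant to this commit is shown.
@ConfigurationProperties(prefix = "datamodels") // assumed prefix matching the YAML section above
public class DatamodelConfigPropertiesSketch {
    private String crawling; // bound from datamodels.crawling, e.g. 60812e54667bd50488d2b237

    public String getCrawling() {
        return crawling;
    }

    public void setCrawling(String crawling) {
        this.crawling = crawling;
    }
}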
settings.gradle

...
@@ -11,7 +11,7 @@ includeBuild('../colreg') {
    }
}*/
-/*includeBuild('../processing') {
+includeBuild('../processing') {
    dependencySubstitution {
        substitute module('de.unibamberg.minf.processing:processing-core') with project(':processing-core')
        substitute module('de.unibamberg.minf.processing:processing-adapters') with project(':processing-adapters')
...
@@ -35,4 +35,4 @@ includeBuild('../core') {
    dependencySubstitution {
        substitute module('de.unibamberg.minf.core:core-web') with project(':core-web')
    }
-}*/
+}
\ No newline at end of file