Commit 79cdd252 authored May 04, 2021 by Gradl, Tobias
420: Implement configurable crawls -> [GS: Repetitive Crawl Model]
(OPENED) Task-Url: #420
parent bd2d28bb
Changes 3
search-core/src/main/java/eu/dariah/de/search/crawling/CrawlHelper.java
0 → 100644
package eu.dariah.de.search.crawling;

import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.message.BasicNameValuePair;

import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.EndpointParam;
import lombok.extern.slf4j.Slf4j;

@Slf4j
public class CrawlHelper {
	private CrawlHelper() {}

	public static String renderAccessUrl(Endpoint ep) {
		return renderAccessUrl(ep.getUrl(), ep.getParams(), null);
	}

	public static String renderAccessUrl(String url, List<EndpointParam> endpointParams, List<Resource> dynamicParams) {
		if (url == null) {
			return null;
		}
		try {
			URIBuilder b = new URIBuilder(url);
			if (endpointParams != null) {
				for (EndpointParam p : endpointParams) {
					b.addParameter(p.getParam(), p.getValue());
				}
			}
			b.addParameters(createNameValuePairs(dynamicParams));
			return b.build().toString();
		} catch (URISyntaxException e) {
			log.error("Failed to build URL", e);
		}
		return null;
	}

	private static List<NameValuePair> createNameValuePairs(List<Resource> dynamicParams) {
		if (dynamicParams == null) {
			return new ArrayList<>();
		}
		List<NameValuePair> pairs = new ArrayList<>();
		List<Resource> names;
		List<Resource> values;
		for (Resource r : dynamicParams) {
			names = ResourceHelper.findRecursive(r, "Name");
			values = ResourceHelper.findRecursive(r, "Value");
			for (Resource name : names) {
				if (name.getValue() == null || name.getValue().toString().trim().isEmpty()) {
					continue;
				}
				for (Resource value : values) {
					if (value.getValue() == null || value.getValue().toString().trim().isEmpty()) {
						continue;
					}
					pairs.add(new BasicNameValuePair(name.getValue().toString().trim(), value.getValue().toString().trim()));
				}
			}
		}
		return pairs;
	}
}
\ No newline at end of file
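
CrawlHelper is a new file in this commit; judging from the import changes to FileDownloader below, it centralizes URL rendering that previously lived in the downloader. The following standalone sketch (not part of the commit) illustrates the URIBuilder pattern that renderAccessUrl relies on: static endpoint parameters plus dynamic parameters appended to a base URL. The base URL and all parameter names/values here are hypothetical examples.

// Standalone sketch of the URIBuilder pattern used by CrawlHelper.renderAccessUrl.
// The URL and parameters are hypothetical, not values from any DARIAH-DE configuration.
import java.net.URISyntaxException;
import org.apache.http.client.utils.URIBuilder;

public class RenderAccessUrlDemo {
	public static void main(String[] args) throws URISyntaxException {
		URIBuilder b = new URIBuilder("https://repository.example.org/oai");
		// Static endpoint parameters, as CrawlHelper adds them from the EndpointParam list:
		b.addParameter("verb", "ListRecords");
		b.addParameter("metadataPrefix", "oai_dc");
		// A dynamic parameter, as created from crawled "Name"/"Value" resources:
		b.addParameter("resumptionToken", "page-2");
		// -> https://repository.example.org/oai?verb=ListRecords&metadataPrefix=oai_dc&resumptionToken=page-2
		System.out.println(b.build().toString());
	}
}
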
search-core/src/main/java/eu/dariah/de/search/crawling/crawler/RepetitiveFileCrawlerImpl.java
package eu.dariah.de.search.crawling.crawler;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.UUID;

import org.springframework.beans.BeansException;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;

import de.unibamberg.minf.core.util.Stopwatch;
import de.unibamberg.minf.dme.model.base.Grammar;
import de.unibamberg.minf.dme.model.base.Nonterminal;
import de.unibamberg.minf.dme.model.function.FunctionImpl;
...
...
@@ -26,7 +30,6 @@ import de.unibamberg.minf.processing.exception.ProcessingConfigException;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import de.unibamberg.minf.processing.service.base.BaseResourceProcessingServiceImpl;
import de.unibamberg.minf.processing.service.base.ProcessingService;
import eu.dariah.de.search.config.MainConfigProperties;
import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.crawling.files.FileDownloader;
...
...
@@ -51,33 +54,28 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
	Map<String, String> fileProcessingServiceMap;

	@Getter @Setter
	private int politenessTimespan = 1000; // 1 call per second (to same endpoint)

	BaseResourceProcessingServiceImpl processingService;
	CollectingResourceConsumptionServiceImpl sourceResCollector = null;
	MappingExecGroup mExecGroup;
	CollectingResourceConsumptionServiceImpl targetResCollector = null;

	private List<String> handledUrls;
	private ExtendedMappingContainer mapping = null;
	private ExtendedMappingContainer mapping;
	private Endpoint endpoint;
	private Crawl crawl;
	private ExtendedDatamodelContainer sourceDatamodelContainer;
	private ExtendedDatamodelContainer targetDatamodelContainer;

	// 1. Use downloader to get first file
	// 2. Get a mapping
	// 3. Execute the mapping

	@Override
	public String getUnitMessageCode() {
		return "~eu.dariah.de.minfba.search.crawling.file.crawler.unit";
	}

	@Override
	public String getTitleMessageCode() {
		return "~eu.dariah.de.minfba.search.crawling.file.crawler.title";
	}

	List<String> processedUrls;
	Queue<String> downloadUris;
	Stopwatch swPoliteness = new Stopwatch();
	@Override
	public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
		this.appContext = applicationContext;
...
...
@@ -89,96 +87,107 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
		super.init(endpoint, crawl, sc);
		this.endpoint = endpoint;
		this.crawl = crawl;
		processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType())));
		targetDatamodelContainer = datamodelService.findById(mainConfig.getDatamodels().getCrawling());
		this.handledUrls = new ArrayList<>();
		if (mainConfig.getDatamodels().getCrawling() == null) {
			logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
		} else {
			if (endpoint.getAccessModelId() != null) {
				logger.info("Dedicated access model configured: {}", endpoint.getAccessModelId());
				sourceDatamodelContainer = datamodelService.findById(endpoint.getAccessModelId());
			} else {
				logger.info("No dedicated access model, using datamodel: {}", sc.getModel().getId());
				sourceDatamodelContainer = sc;
			}
			mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodelContainer.getModel().getId(), mainConfig.getDatamodels().getCrawling());
			if (mapping == null) {
				logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
			}
		this.initCrawlModels(sc);
		if (mapping == null) {
			return;
		}
		this.initServices();
		this.processedUrls = new ArrayList<>();
		this.downloadUris = new LinkedList<>();
	}
	@Override
	public void downloadFile() {
		swPoliteness.start();
		super.downloadFile();
		File f = new File(this.getOutputPath());
		if (f.exists()) {
			logger.debug("file exists: {}", f.getAbsolutePath());
			if (this.mExecGroup == null) {
				logger.debug("Resumptive crawling not applicable -> crawl done");
				return;
			}
			CollectingResourceConsumptionServiceImpl collector = new CollectingResourceConsumptionServiceImpl();
			this.reset();
			this.processDownloadedFile();
			try {
				processingService.setSchema(sourceDatamodelContainer.getModel());
				processingService.setRoot((Nonterminal) sourceDatamodelContainer.getOrRenderElementHierarchy());
				processingService.setInputStream(new FileInputStream(f));
				processingService.init();
				processingService.addConsumptionService(collector);
				processingService.run();
			} catch (ProcessingConfigException | FileNotFoundException e) {
				logger.error("Exception while processing", e);
			}
			logger.info("Resources counted:" + collector.getResources().size());
			MappingExecGroup mex = this.buildMappingExecutionGroup(mapping, targetDatamodelContainer);
			this.collectDownloadUris();
			this.setupAndDownloadNextFile();
		}
	private void initCrawlModels(ExtendedDatamodelContainer sc) {
		mapping = null;
		CollectingResourceConsumptionServiceImpl collector2 = new CollectingResourceConsumptionServiceImpl();
		mappingExecutionService.addConsumptionService(collector2);
		// Crawl model available
		if (mainConfig.getDatamodels().getCrawling() == null) {
			logger.warn("No GS: Repetitive Crawl Model configured; repetitive file crawling unavailable");
			return;
		}
		if (mex.getConcepts() != null && mex.getConcepts().size() > 0) {
			try {
				mappingExecutionService.init(mex, collector.getResources());
				mappingExecutionService.run();
			} catch (ProcessingConfigException e) {
				logger.error("Failed to initialize MappingExecutionService", e);
		// Dedicated access model or datamodel?
		if (endpoint.getAccessModelId() != null) {
			logger.debug("Dedicated access model configured: {}", endpoint.getAccessModelId());
			sourceDatamodelContainer = datamodelService.findById(endpoint.getAccessModelId());
			if (sourceDatamodelContainer == null) {
				logger.warn("Dedicated access model configured but not available (Sync with DME required?)");
				return;
			}
		} else {
			logger.debug("No dedicated access model, using datamodel: {}", sc.getModel().getId());
			sourceDatamodelContainer = sc;
		}
		logger.info("Trans resources counted:" + collector2.getResources().size());
		// Crawl model
		targetDatamodelContainer = datamodelService.findById(mainConfig.getDatamodels().getCrawling());
		if (targetDatamodelContainer == null) {
			logger.warn("Crawl model configured but not available (Sync with DME required?)");
			return;
		}
		// Mapping between source (access or data) model and target crawl model
		mapping = mappingService.getMappingBySourceAndTarget(sourceDatamodelContainer.getModel().getId(), mainConfig.getDatamodels().getCrawling());
		if (mapping == null) {
			logger.info("No mapping to GS: Repetitive Crawl Model modeled; repetitive file crawling not configured");
		}
	}

	private void initServices() {
		processingService = null;
		if (collector2.getResources() != null) {
		// Setup everything that is needed for file processing
		if (!fileProcessingServiceMap.containsKey(endpoint.getFileType())) {
			logger.warn("Endpoint file type unsupported by repetitive crawling: {}", endpoint.getFileType());
			return;
		}
		try {
			processingService = BaseResourceProcessingServiceImpl.class.cast(appContext.getBean(fileProcessingServiceMap.get(endpoint.getFileType())));
			processingService.setSchema(sourceDatamodelContainer.getModel());
			processingService.setRoot((Nonterminal) sourceDatamodelContainer.getOrRenderElementHierarchy());
			for (Resource r : collector2.getResources()) {
				List<Resource> res = ResourceHelper.findRecursive(r, "GET.Param");
				String newUrl = CrawlHelper.renderAccessUrl(this.endpoint.getUrl(), this.endpoint.getParams(), res);
				// TODO: Next download from here...
				logger.debug(newUrl);
			}
			sourceResCollector = new CollectingResourceConsumptionServiceImpl();
			processingService.addConsumptionService(sourceResCollector);
		} catch (Exception e) {
			logger.warn("No supporting processing service available for file type: {}", endpoint.getFileType());
			return;
		}
		// Mapping execution
		mExecGroup = this.buildMappingExecutionGroup(mapping, targetDatamodelContainer);
		targetResCollector = new CollectingResourceConsumptionServiceImpl();
		mappingExecutionService.addConsumptionService(targetResCollector);
		if (mExecGroup.getConcepts() == null || mExecGroup.getConcepts().isEmpty()) {
			mExecGroup = null;
		}
	}
	public MappingExecGroup buildMappingExecutionGroup(ExtendedMappingContainer mc, ExtendedDatamodelContainer scTarget) {
	private MappingExecGroup buildMappingExecutionGroup(ExtendedMappingContainer mc, ExtendedDatamodelContainer scTarget) {
		if (mc == null) {
			return null;
		}
		MappingExecGroup exec = new MappingExecGroup();
		exec.setMapping(mc.getMapping());
		exec.setTargetSchemaId(mc.getMapping().getTargetId());
		exec.setTargetElementTree(scTarget.getOrRenderElementHierarchy());
		for (MappedConcept c : mc.getMapping().getConcepts()) {
...
...
@@ -208,9 +217,104 @@ public class RepetitiveFileCrawlerImpl extends FileDownloader implements Applica
		}
		return exec;
	}
	private void reset() {
		if (sourceResCollector != null) {
			sourceResCollector.setResources(null);
		}
		if (targetResCollector != null) {
			targetResCollector.setResources(null);
		}
	}

	private void processDownloadedFile() {
		logger.debug("Processing downloaded file: {}", this.getOutputPath());
		try {
			processingService.setInputStream(new FileInputStream(this.getOutputPath()));
			processingService.init();
			processingService.run();
		} catch (ProcessingConfigException | FileNotFoundException e) {
			logger.error("Exception while processing", e);
		}
		if (sourceResCollector.getResources() != null && !sourceResCollector.getResources().isEmpty()) {
			try {
				mappingExecutionService.init(mExecGroup, sourceResCollector.getResources());
				mappingExecutionService.run();
			} catch (ProcessingConfigException e) {
				logger.error("Failed to initialize MappingExecutionService", e);
			}
		}
		logger.debug("Mapping execution transformed {} to {} resources",
				sourceResCollector.getResources() == null ? 0 : sourceResCollector.getResources().size(),
				targetResCollector.getResources() == null ? 0 : targetResCollector.getResources().size());
	}
	private void collectDownloadUris() {
		List<Resource> res;
		String newUrl;
		if (targetResCollector.getResources() != null) {
			for (Resource r : targetResCollector.getResources()) {
				res = ResourceHelper.findRecursive(r, "GET.Param");
				newUrl = CrawlHelper.renderAccessUrl(this.endpoint.getUrl(), this.endpoint.getParams(), res);
				if (!processedUrls.contains(newUrl)) {
					processedUrls.add(newUrl);
					downloadUris.add(newUrl);
				}
			}
		}
	}
	private void setupAndDownloadNextFile() {
		if (downloadUris.isEmpty()) {
			logger.debug("Download queue is empty -> file crawling is complete");
			return;
		}
		try {
			this.setupPaths(this.crawl);
		} catch (MalformedURLException e) {
			logger.error("Failed to setup paths for crawl", e);
			return;
		}
		this.inputURI = null;
		try {
			String downloadUri = downloadUris.remove();
			logger.debug("Processing next downloadURI: {}", downloadUri);
			this.inputURI = new URL(downloadUri).toURI();
		} catch (MalformedURLException | URISyntaxException e) {
			logger.error("Failed to setup and download next file URI", e);
		}
		// If next inputURI is setup -> download, else continue with next download URI
		if (this.inputURI != null) {
			this.continueDownload();
		} else {
			this.setupAndDownloadNextFile();
		}
	}
	private void continueDownload() {
		long sleepTs = this.politenessTimespan - swPoliteness.getElapsedTime();
		if (sleepTs > 0) {
			if (logger.isDebugEnabled()) {
				logger.debug(String.format("Crawl sleeping %sms due to politeness setting (%sms)", sleepTs, this.getPolitenessTimespan()));
			}
			try {
				Thread.sleep(sleepTs);
				this.downloadFile();
			} catch (InterruptedException e) {
				logger.error("Thread.sleep interrupted", e);
				Thread.currentThread().interrupt();
			}
		} else {
			this.downloadFile();
		}
	}
	@Override
	protected String getOutputFilename() {
		return fileName + "." + (handledUrls == null ? 0 : handledUrls.size());
		return fileName + "." + (processedUrls == null ? 0 : processedUrls.size());
	}
}
\ No newline at end of file
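
Taken together, downloadFile(), collectDownloadUris(), setupAndDownloadNextFile() and continueDownload() implement a queue-driven download loop: render follow-up URLs from the mapped crawl-model resources, deduplicate against already-processed URLs, and pause between calls to the same endpoint according to politenessTimespan. The following condensed, self-contained sketch shows only that control flow; all names are illustrative and the download/extraction steps are stubbed, so it is an assumption-laden outline rather than the class's actual implementation.

// Condensed sketch of the repetitive crawl loop (queueing, deduplication, politeness).
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

public class RepetitiveCrawlLoopSketch {
	private static final long POLITENESS_TIMESPAN_MS = 1000; // 1 call per second (to same endpoint)

	public static void main(String[] args) throws InterruptedException {
		Queue<String> downloadUris = new ArrayDeque<>();
		List<String> processedUrls = new ArrayList<>();
		downloadUris.add("https://repository.example.org/oai?verb=ListRecords"); // hypothetical seed URL

		while (!downloadUris.isEmpty()) {
			long start = System.currentTimeMillis();
			String url = downloadUris.remove();
			// Download the file, process it against the source model, execute the
			// mapping to the crawl model, and render follow-up URLs (all stubbed here):
			for (String next : fetchAndExtractNextUrls(url)) {
				if (!processedUrls.contains(next)) { // skip URLs already handled
					processedUrls.add(next);
					downloadUris.add(next);
				}
			}
			// Politeness: wait out the remainder of the timespan before the next call.
			long sleepTs = POLITENESS_TIMESPAN_MS - (System.currentTimeMillis() - start);
			if (sleepTs > 0 && !downloadUris.isEmpty()) {
				Thread.sleep(sleepTs);
			}
		}
	}

	private static List<String> fetchAndExtractNextUrls(String url) {
		return new ArrayList<>(); // stub: no follow-up pages in this sketch
	}
}
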
search-core/src/main/java/eu/dariah/de/search/crawling/files/FileDownloader.java
...
...
@@ -10,21 +10,11 @@ import java.net.URISyntaxException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.message.BasicNameValuePair;

import de.unibamberg.minf.processing.exception.ResourceProcessingException;
import de.unibamberg.minf.processing.model.base.Resource;
import de.unibamberg.minf.processing.model.helper.ResourceHelper;
import eu.dariah.de.search.crawling.CrawlHelper;
import eu.dariah.de.search.crawling.crawler.Crawler;
import eu.dariah.de.search.model.Crawl;
import eu.dariah.de.search.model.Endpoint;
import eu.dariah.de.search.model.EndpointParam;
import eu.dariah.de.search.model.ExtendedDatamodelContainer;
public class FileDownloader extends BaseFileStreamCrawler implements Crawler {
...
...