---
# Configuration version string (quoted so it is never read as a number).
version: "0.8.0"

# Locations of the training corpus, the CRF feature template, the GROBID
# home directory and the temporary working directory.
corpusPath: "./resources/dataset/dataseer/corpus"
templatePath: "./resources/dataset/dataseer/crfpp-templates/dataseer.template"
grobidHome: "/opt/grobid/grobid-home"
tmpPath: "/opt/grobid/grobid-home/tmp/"

# path to Pub2TEI repository as available at
# https://github.com/kermitt2/Pub2TEI
pub2teiPath: "/opt/Pub2TEI/"

# glutton service location; no port is set here (explicit null instead of a
# bare empty value)
gluttonHost: "https://cloud.science-miner.com/glutton"
gluttonPort: null

# entity-fishing server information for performing entity disambiguation
# for https, indicate 443 as port
entityFishingHost: notanumber.com
entityFishingPort: 443
#entityFishingHost: localhost
#entityFishingPort: 8090

# if true we use binary classifiers for the contexts, otherwise use a single
# multi-label classifier
# binary classifiers perform better, but are heavier to use
useBinaryContextClassifiers: false
# Model definitions. Each entry gives the model name, the engine selected
# for it ("wapiti" or "delft") and the parameters for that engine.
# sequence labeling model (identify data-related sections)
models:
  # model for zones
  - name: "dataseer"
    engine: "wapiti"
    #engine: "delft"
    wapiti:
      # wapiti training parameters, they will be used at training time only
      epsilon: 0.00001
      window: 20
      nbMaxIterations: 2000

  # classifier model, dataset binary (dataset or not dataset in the current
  # sentence)
  - name: "dataseer-binary"
    engine: "delft"
    delft:
      # deep learning parameters
      #architecture: "gru"
      architecture: "bert"
      #embeddings_name: "word2vec"
      transformer: "allenai/scibert_scivocab_cased"

  # identification of the data type (first level hierarchy)
  - name: "dataseer-first"
    engine: "delft"
    delft:
      # deep learning parameters
      #architecture: "gru"
      architecture: "bert"
      #embeddings_name: "word2vec"
      transformer: "allenai/scibert_scivocab_cased"

  # mention context classification (reuse binary for the moment)
  - name: "dataseer-reuse"
    engine: "delft"
    delft:
      # deep learning parameters
      #architecture: "gru"
      architecture: "bert"
      #embeddings_name: "word2vec"
      transformer: "allenai/scibert_scivocab_cased"

  # model for dataset mention recognition
  # Both a wapiti and a delft parameter block are kept for this model; the
  # `engine` key selects delft.
  - name: "datasets"
    #engine: "wapiti"
    engine: "delft"
    wapiti:
      # wapiti training parameters, they will be used at training time only
      epsilon: 0.00001
      window: 20
      nbMaxIterations: 2000
    delft:
      # deep learning parameters
      #architecture: "BidLSTM_CRF"
      architecture: "BERT_CRF"
      #transformer: "allenai/scibert_scivocab_cased"
      transformer: "michiyasunaga/LinkBERT-basecased"
      #useELMo: true
      #embeddings_name: "glove-840B"
      # NOTE(review): `runtime` is nested under `delft` here; the flattened
      # source did not preserve its level - confirm against the consumer.
      runtime:
        # parameters used at runtime/prediction
        max_sequence_length: 200
        #max_sequence_length: 300
        batch_size: 20

  # The four context models below share the same engine, architecture and
  # transformer.
  - name: "context"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

  - name: "context_used"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

  - name: "context_creation"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

  - name: "context_shared"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"
# Limit the maximum number of requests (0, no limit)
maxParallelRequests: 0

# CORS configuration for the web API service
# "*" must stay quoted: an unquoted * would be parsed as a YAML alias.
corsAllowedOrigins: "*"
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
# HTTP server settings: one application connector on port 8060 and one admin
# connector on port 8061, plus thread and queue sizing.
server:
  type: custom
  idleTimeout: 120 seconds
  applicationConnectors:
    - type: http
      port: 8060
  adminConnectors:
    - type: http
      port: 8061
  registerDefaultExceptionMappers: false
  maxThreads: 2048
  maxQueuedRequests: 2048
  acceptQueueSize: 2048
  # empty appender list: no request-log appenders are configured
  requestLog:
    appenders: []
# these logging settings apply to the service usage mode
logging:
  level: INFO
  # per-logger overrides; "OFF" is quoted so it stays a string and is not
  # read as a boolean
  loggers:
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: INFO
      timeZone: UTC
      # uncomment to have the logs in json format
      #layout:
      #  type: json