Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for extract pipelines with groups (AP-1764) #111

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
215 changes: 215 additions & 0 deletions config/service/marie-extract.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
# Flow definition header: object type and schema version of this config.
jtype: Flow
version: "1"
# NOTE(review): a `protocol` list (http, grpc) is also declared under `with:`
# further down in this file - confirm which of the two the loader honours.
protocol: grpc

# Shared configuration
# Connection profiles declared once and tagged with YAML anchors. The psql and
# rabbitmq profiles are pulled into other sections of this file through merge
# keys (`<<: *psql_conf_shared`, `<<: *rabbitmq_conf_shared`).
shared_config:
  storage: &storage
    psql: &psql_conf_shared
      provider: postgresql
      hostname: 127.0.0.1
      port: 5432
      username: postgres
      # Quoted so the value is loaded as a string; unquoted it is parsed as the
      # integer 123456.
      # NOTE(review): plaintext credential committed to the repository - consider
      # reading it from the environment like the message-queue credentials below.
      password: '123456'
      database: postgres
      # Default table name; sections that merge this profile override it.
      default_table: shared_docs

  message: &message
    amazon_mq: &amazon_mq_conf_shared
      provider: amazon-rabbitmq
      hostname: ${{ ENV.AWS_MQ_HOSTNAME }}
      # NOTE(review): 15672 is conventionally the RabbitMQ management (HTTP) port,
      # not the AMQP/AMQPS listener - confirm this is the intended port.
      port: 15672
      username: ${{ ENV.AWS_MQ_USERNAME }}
      password: ${{ ENV.AWS_MQ_PASSWORD }}
      tls: true
      virtualhost: /

    rabbitmq: &rabbitmq_conf_shared
      provider: rabbitmq
      hostname: ${{ ENV.RABBIT_MQ_HOSTNAME }}
      port: ${{ ENV.RABBIT_MQ_PORT }}
      username: ${{ ENV.RABBIT_MQ_USERNAME }}
      password: ${{ ENV.RABBIT_MQ_PASSWORD }}
      tls: false
      virtualhost: /


# Toast event tracking system
# Events can be backed by a message queue and/or a database.
toast:
  native:
    path: /tmp/marie/events.json
    enabled: true
  rabbitmq:
    # Reuses the shared RabbitMQ connection profile.
    <<: *rabbitmq_conf_shared
    enabled: true
  psql:
    # Reuses the shared PostgreSQL profile; only the table name is overridden.
    <<: *psql_conf_shared
    enabled: true
    default_table: event_tracking

# Document Storage
# The storage service is used to store the data that is being processed.
# Storage can be backed by an S3-compatible object store or by PostgreSQL.
storage:
  # S3 configuration. Will be used only if value of backend is "s3"
  s3:
    enabled: true
    # If true, only metadata will be stored in the storage backend
    metadata_only: false
    # API endpoint to connect to. Use AWS S3 or any S3-compatible object
    # storage endpoint.
    endpoint_url: ${{ ENV.S3_ENDPOINT_URL }}
    # Optional. Access key id when using static credentials.
    access_key_id: ${{ ENV.S3_ACCESS_KEY_ID }}
    # Optional. Secret key when using static credentials.
    secret_access_key: ${{ ENV.S3_SECRET_ACCESS_KEY }}
    # Bucket name in S3
    bucket_name: ${{ ENV.S3_BUCKET_NAME }}
    # Optional. Example: "region: us-east-2"
    region: ${{ ENV.S3_REGION }}
    # Optional. Enable if endpoint is http
    insecure: true
    # Optional. Enable if you want to use path style requests
    addressing_style: path

  # PostgreSQL configuration. Will be used only if value of backend is "psql"
  psql:
    <<: *psql_conf_shared
    enabled: false
    default_table: store_metadata

# Job Queue scheduler
# Uses the shared PostgreSQL profile with its own table name.
scheduler:
  psql:
    <<: *psql_conf_shared
    enabled: true
    default_table: job_queue

# FLOW / GATEWAY configuration

with:
  # NOTE(review): presumably each port is paired positionally with the protocol
  # at the same index (51000 -> http, 52000 -> grpc) - confirm.
  port: [51000, 52000]
  protocol: [http, grpc]

  # Service discovery
  discovery: true
  discovery_host: 127.0.0.1
  discovery_port: 8500

  host: 127.0.0.1

  # Monitoring
  monitoring: true
  port_monitoring: 57844

  event_tracking: true

  # HTTP endpoints exposed by the gateway
  expose_endpoints:
    /document/extract:
      methods: [POST]
      summary: Extract data-POC
      tags: [extract]

    /status:
      methods: [POST]
      summary: Status
      tags: [extract]

    # NOTE(review): summary reads "Extract data" on a status route - possibly a
    # copy-paste; confirm the intended wording.
    /text/status:
      methods: [POST]
      summary: Extract data
      tags: [extract]

    /ner/extract:
      methods: [POST]
      summary: Extract NER
      tags: [ner]

    /document/classify:
      methods: [POST]
      summary: Classify document at page level
      tags: [classify]

# NOTE(review): indentation is not visible in this view, so it is unclear whether
# `prefetch` is a top-level key or an entry of the `with:` mapping - confirm.
prefetch: 4

# Executors started by the Flow. A single text-extraction executor is declared;
# it receives its own metadata storage and the list of extract pipelines.
executors:
  - name: extract_t
    uses:
      jtype: TextExtractionExecutor
      # jtype: TextExtractionExecutorMock
      with:
        storage:
          # PostgreSQL configuration. Will be used only if value of backend is "psql"
          psql:
            <<: *psql_conf_shared
            enabled: true
            default_table: extract_metadata
        pipelines:
          - pipeline:
              name: default
              default: true

              # NOTE(review): group 'medical-page-classifier' has no page_indexer
              # entry with the same group below - confirm that is intended.
              page_classifier:
                - name: medical_page_classifier
                  group: medical-page-classifier
                  model_name_or_path: marie/lmv3-medical-document-classification
                  type: transformers
                  task: text-classification-multimodal
                  device: cuda
                  enabled: true
                  batch_size: 1

                - name: medical_payer_classifier
                  group: medical-payer-classifier
                  model_name_or_path: marie/lmv3-medical-document-payer
                  type: transformers
                  task: text-classification-multimodal
                  device: cuda
                  enabled: true
                  batch_size: 1

              page_indexer:
                - name: page_indexer_patient
                  group: medical-payer-classifier
                  model_name_or_path: rms/layoutlmv3-large-corr-ner
                  type: transformers
                  device: cuda
                  enabled: true
                  # The regex '.*' matches every value.
                  filter:
                    type: regex
                    pattern: '.*'

              page_splitter:
                model_name_or_path: marie/layoutlmv3-medical-document-splitter
                enabled: true
      metas:
        py_modules:
          - marie.executor.text
    # NOTE(review): presumably milliseconds - confirm the unit.
    timeout_ready: 3000000
    replicas: 1
    # replicas: ${{ CONTEXT.gpu_device_count }}
    env:
      # NOTE(review): 'RR' presumably requests round-robin GPU assignment by the
      # runtime - confirm.
      CUDA_VISIBLE_DEVICES: RR
# Authentication and Authorization configuration
# NOTE(review): the API keys below are stored in plaintext in a committed file -
# confirm they are sample values, or move them to environment variables.
auth:
  keys:
    - name: service-A
      enabled: true
      api_key: mas_0aPJ9Q9nUO1Ac1vJTfffXEXs9FyGLf9BzfYgZ_RaHm707wmbfHJNPQ
      roles:
        - admin
        - user

    - name: service-B
      enabled: true
      api_key: mau_t6qDi1BcL1NkLI8I6iM8z1va0nZP01UQ6LWecpbDz6mbxWgIIIZPfQ
      roles:
        - admin
        - user
129 changes: 129 additions & 0 deletions config/tests-integration/pipeline-classify-004.partial.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
pipelines:
  # Default pipeline: page classifiers in two groups, one sub-classifier and one
  # page indexer.
  - pipeline:
      name: default  # name of the pipeline, used for logging and asset saving
      default: true
      # Class index (as a string key) -> label
      id2label:
        '0': additional_information
        '1': attorney
        '2': auth_approval
        '3': auth_denial
        '4': bankruptcy
        '5': cms_letter
        '6': dispute
        '7': eligibility
        '8': medical_certificate
        '9': medical_record
        '10': misc
        '11': newborn
        '12': noop_blank
        '13': noop_check
        '14': noop_cover
        '15': noop_envelope
        '16': noop_eob
        '17': noop_hicfa
        '18': noop_notice
        '19': noop_patpay
        '20': noop_w9
        '21': pa_162
        '22': referral
        '23': refund_request
        '24': tax_1099

      page_classifier:
        - name: corr_page_classifier_layoutlmv3
          group: corr-classifier
          model_name_or_path: rms/corr-layoutlmv3-classifier
          type: transformers
          task: text-classification-multimodal
          device: cuda
          enabled: true

        - name: corr_page_classifier_longformer
          group: corr-classifier
          model_name_or_path: rms/corr-longformer-classifier
          type: transformers
          task: text-classification
          device: cuda
          enabled: true
          batch_size: 1  # batch size > 1 causes errors due to wrong batch aggregation

        # NOTE(review): same name and model as the first entry, registered under
        # a different group - confirm that names need not be unique.
        - name: corr_page_classifier_layoutlmv3
          group: jpmc-classifier
          model_name_or_path: rms/corr-layoutlmv3-classifier
          type: transformers
          task: text-classification-multimodal
          device: cuda
          enabled: true
        # - model_name_or_path: 'rms/corr-payer-longformer-classifier'
        #   task: 'text-classification'
        #   name: 'corr_payer_longformer'
        #   type: 'transformers'
        #   enabled: True
        #   batch_size: 1 # batch size > 1 causes errors due to wrong batch aggregation
        #   device: 'cuda'
        #   group: 'corr-payer-classifier'

      sub_classifier:
        - name: corr_auth_sub_classifier
          group: corr-classifier
          model_name_or_path: rms/corr-auth-longformer-classifier
          type: transformers
          task: text-classification
          device: cuda
          enabled: true
          batch_size: 1  # batch size > 1 causes errors due to wrong batch aggregation

          id2label:
            '0': auth_denial_in
            '1': auth_denial_op
          # Filter should be on the same level as the sub-classifier, for now this is just a global filter
          filter:
            type: exact
            pattern: auth_denial

      # NOTE(review): no `group` is set on this indexer although every classifier
      # above has one - confirm that is intended.
      page_indexer:
        - name: page_indexer_patient
          model_name_or_path: rms/layoutlmv3-large-corr-ner
          type: transformers
          device: cuda
          enabled: true
          filter:
            type: regex
            pattern: '.*'

  # Non-default pipeline with a single page classifier.
  - pipeline:
      name: jpmc-corr
      default: false
      device: cuda
      # Class index (as a string key) -> label
      id2label:
        '0': additional_information
        '1': attorney
        '2': auth_approval
        '3': auth_denial
        '4': bankruptcy
        '5': cms_letter
        '6': dispute
        '7': eligibility
        '8': medical_certificate
        '9': medical_record
        '10': misc
        '11': newborn
        '12': noop_blank
        '13': noop_check
        '14': noop_cover
        '15': noop_envelope
        '16': noop_eob
        '17': noop_hicfa
        '18': noop_notice
        '19': noop_patpay
        '20': noop_w9
        '21': pa_162
        '22': referral
        '23': refund_request
        '24': tax_1099

      page_classifier:
        - name: corr_page_classifier_layoutlmv3
          group: corr-classifier
          model_name_or_path: rms/corr-layoutlmv3-classifier
          type: transformers
          task: text-classification-multimodal
          device: cuda
          enabled: true
46 changes: 46 additions & 0 deletions config/tests-integration/pipeline-integration-001.partial.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
pipelines:
  # Default pipeline: two classifier groups, each with a page indexer carrying
  # the same group name, plus a page splitter.
  - pipeline:
      name: default
      default: true

      page_classifier:
        - name: medical_page_classifier
          group: medical-classifier
          model_name_or_path: marie/lmv3-medical-document-classification
          type: transformers
          task: text-classification-multimodal
          device: cuda
          enabled: true
          batch_size: 1

        - name: medical_payer_classifier
          group: medical-payer-classifier
          model_name_or_path: marie/lmv3-medical-document-payer
          type: transformers
          task: text-classification-multimodal
          device: cuda
          enabled: true
          batch_size: 1

      page_indexer:
        - name: page_indexer_patient
          group: medical-classifier
          model_name_or_path: rms/layoutlmv3-large-corr-ner
          type: transformers
          device: cuda
          enabled: true
          # The regex '.*' matches every value.
          filter:
            type: regex
            pattern: '.*'

        - name: page_indexer_payer
          group: medical-payer-classifier
          model_name_or_path: rms/layoutlmv3-large-corr-ner
          type: transformers
          device: cuda
          enabled: true
          filter:
            type: regex
            pattern: '.*'

      page_splitter:
        model_name_or_path: marie/layoutlmv3-medical-document-splitter
        enabled: true
Loading