Skip to content

Commit

Permalink
Add Workflow Jobs for text processing and chunking
Browse files Browse the repository at this point in the history
- Implement ConvertToText, AnonymizeText, and CreateChunks jobs
- Update Workflow class to use new job classes
  • Loading branch information
gbp committed Sep 16, 2024
1 parent 16ad901 commit 2412db2
Show file tree
Hide file tree
Showing 13 changed files with 258 additions and 17 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ group :test, :development do
gem 'rspec-activemodel-mocks', '~> 1.2.0'
gem 'rspec-rails', '~> 7.0.1'
gem 'pry', '~> 0.14.2'
gem 'vcr', '~> 6.3.1'
end

group :development do
Expand Down
3 changes: 3 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,8 @@ GEM
unidecoder (1.1.2)
uniform_notifier (1.16.0)
uri (0.13.0)
vcr (6.3.1)
base64
vpim (24.2.20)
web-console (4.2.1)
actionview (>= 6.0.0)
Expand Down Expand Up @@ -697,6 +699,7 @@ DEPENDENCIES
uglifier (~> 4.2.0)
unicode (~> 0.4.4)
unidecoder (~> 1.1.0)
vcr (~> 6.3.1)
vpim (~> 24.2.20)
web-console (>= 3.3.0)
webmock (~> 3.23.1)
Expand Down
12 changes: 6 additions & 6 deletions app/models/workflow.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
# The Workflow class represents a sequence of jobs to be executed on a resource.
#
# Usage:
# workflow = Workflow.example(resource)
# workflow = Workflow.chunking(resource)
# workflow.run
#
# Class Methods:
# example(resource) - Creates a new example Workflow instance
# chunking(resource) - Creates a new Workflow instance for text chunking
#
class Workflow
def self.example(resource)
def self.chunking(resource)
Workflow.new(
resource: resource,
jobs: [
Workflow::Job,
Workflow::Job,
Workflow::Job
Workflow::Jobs::ConvertToText,
Workflow::Jobs::AnonymizeText,
Workflow::Jobs::CreateChunks
]
)
end
Expand Down
36 changes: 36 additions & 0 deletions app/models/workflow/jobs/anonymize_text.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# == Schema Information
# Schema version: 20240905062817
#
# Table name: workflow_jobs
#
# id :bigint not null, primary key
# type :string
# resource_type :string
# resource_id :bigint
# status :integer
# parent_id :bigint
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
#

##
# This class represents a job for anonymizing text using an external command.
#
class Workflow::Jobs::AnonymizeText < Workflow::Job
  # Writes +source+ to a temporary file, runs the external redaction command
  # against it and returns whatever the command printed on standard output.
  #
  # The command line is "<REDACT_COMMAND> --file <temp path>", where
  # REDACT_COMMAND is read from the environment.
  #
  # The block form of Tempfile.create is used so the file is closed *and
  # removed* as soon as the block exits, including when an exception is
  # raised. This avoids leaving a copy of the un-redacted text on disk, and
  # there is no +ensure+ clause that could call +close+ on nil when creating
  # the file itself fails.
  #
  # Returns the String read from the command's standard output.
  def perform
    Tempfile.create do |file|
      file.write(source)
      # Flush so the external process sees the full text when it opens the path.
      file.flush

      # NOTE(review): REDACT_COMMAND is read without a presence check and the
      # command is passed to IO.popen as one string - confirm the variable is
      # always set (and trusted) wherever this job runs.
      cmd = [ENV['REDACT_COMMAND'], '--file', file.path].join(' ')
      IO.popen(cmd, &:read)
    end
  end

  # MIME type of the value returned by #perform.
  def content_type
    'text/plain'
  end
end
30 changes: 30 additions & 0 deletions app/models/workflow/jobs/convert_to_text.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# == Schema Information
# Schema version: 20240905062817
#
# Table name: workflow_jobs
#
# id :bigint not null, primary key
# type :string
# resource_type :string
# resource_id :bigint
# status :integer
# parent_id :bigint
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
#

##
# This class is responsible for converting HTML content to plain text.
#
class Workflow::Jobs::ConvertToText < Workflow::Job
  # Provides the +sanitize+ and +strip_tags+ helpers used by #perform.
  include ActionView::Helpers::SanitizeHelper

  # Sanitizes +source+ first, then strips the remaining tags from the
  # sanitized markup and returns the result.
  def perform
    sanitized_markup = sanitize(source)
    strip_tags(sanitized_markup)
  end

  # MIME type of the value returned by #perform.
  def content_type
    'text/plain'
  end
end
36 changes: 36 additions & 0 deletions app/models/workflow/jobs/create_chunks.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# == Schema Information
# Schema version: 20240905062817
#
# Table name: workflow_jobs
#
# id :bigint not null, primary key
# type :string
# resource_type :string
# resource_id :bigint
# status :integer
# parent_id :bigint
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
#

##
# This class represents a job in the workflow system that creates chunks for a resource.

Check warning on line 18 in app/models/workflow/jobs/create_chunks.rb

View workflow job for this annotation

GitHub Actions / build

[rubocop] reported by reviewdog 🐶 Line is too long. [88/80] (https://rubystyle.guide#max-line-length) Raw Output: app/models/workflow/jobs/create_chunks.rb:18:81: C: Layout/LineLength: Line is too long. [88/80] (https://rubystyle.guide#max-line-length)
#
class Workflow::Jobs::CreateChunks < Workflow::Job
  # Once the job record has been destroyed, remove the resource's chunks too.
  after_destroy :destroy_chunks

  # Creates a chunk on the resource whose text is +source+ (raising if the
  # record cannot be saved) and returns the result of +to_gid+ on that chunk.
  def perform
    chunk = resource.chunks.create!(text: source)
    chunk.to_gid
  end

  # MIME type reported for this job's output.
  def content_type
    'application/json'
  end

  private

  # after_destroy callback: destroys every chunk associated with the resource.
  def destroy_chunks
    chunks = resource.chunks
    chunks.destroy_all
  end
end
12 changes: 12 additions & 0 deletions spec/factories/workflow_jobs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,17 @@
FactoryBot.define do
  # Base factory: a Workflow::Job attached to a freshly built FOI attachment.
  factory :workflow_job, class: 'Workflow::Job' do
    resource { build(:foi_attachment) }

    # Child factories, one per concrete job class. Each inherits +resource+
    # from :workflow_job and starts with an empty +source+.
    {
      convert_to_text: 'Workflow::Jobs::ConvertToText',
      anonymize_text: 'Workflow::Jobs::AnonymizeText',
      create_chunks: 'Workflow::Jobs::CreateChunks'
    }.each do |factory_name, job_class|
      factory factory_name, class: job_class do
        source { '' }
      end
    end
  end
end
33 changes: 33 additions & 0 deletions spec/fixtures/cassettes/test_chunk.yml

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions spec/models/workflow/jobs/anonymize_text_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
require 'spec_helper'

RSpec.describe Workflow::Jobs::AnonymizeText, type: :model do
  # Unsaved job built from the :anonymize_text factory.
  let(:job) { FactoryBot.build(:anonymize_text) }

  it 'inherits from Workflow::Job' do
    expect(job).to be_a_kind_of(Workflow::Job)
  end

  describe '#perform' do
    it 'calls an external command to anonymize text' do
      # Stub IO.popen so no real external process is started.
      allow(IO).to receive(:popen).and_return('Anonymized text')

      result = job.perform

      expect(result).to eq('Anonymized text')
    end
  end

  describe '#content_type' do
    it 'returns the correct content type' do
      reported_type = job.content_type

      expect(reported_type).to eq('text/plain')
    end
  end
end
22 changes: 22 additions & 0 deletions spec/models/workflow/jobs/convert_to_text_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
require 'spec_helper'

RSpec.describe Workflow::Jobs::ConvertToText, type: :model do
  # Unsaved job built from the :convert_to_text factory.
  let(:job) { FactoryBot.build(:convert_to_text) }

  it 'inherits from Workflow::Job' do
    expect(job).to be_a_kind_of(Workflow::Job)
  end

  describe '#perform' do
    it 'converts HTML to plain text' do
      job.source = '<p>Hello <strong>World</strong></p>'

      plain_text = job.perform

      expect(plain_text).to eq('Hello World')
    end
  end

  describe '#content_type' do
    it 'returns the correct content type' do
      reported_type = job.content_type

      expect(reported_type).to eq('text/plain')
    end
  end
end
41 changes: 41 additions & 0 deletions spec/models/workflow/jobs/create_chunks_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
require 'spec_helper'

RSpec.describe Workflow::Jobs::CreateChunks, type: :model do
  # Unsaved job built from the :create_chunks factory.
  let(:job) { FactoryBot.build(:create_chunks) }

  it 'inherits from Workflow::Job' do
    expect(job).to be_a_kind_of(Workflow::Job)
  end

  describe '#perform' do
    it 'creates a chunk for the resource' do
      # HTTP traffic inside this block is served from the recorded cassette.
      VCR.use_cassette('test_chunk') do
        attachment = FactoryBot.create(:foi_attachment)
        job.resource = attachment
        job.source = 'Test chunk'

        expect { job.perform }.
          to change { attachment.chunks.count }.by(1)
      end
    end
  end

  describe '#content_type' do
    it 'returns the correct content type' do
      reported_type = job.content_type

      expect(reported_type).to eq('application/json')
    end
  end

  describe 'callbacks' do
    it 'destroys associated chunks when the job is destroyed' do
      VCR.use_cassette('test_chunk') do
        existing_chunk = FactoryBot.build(:chunk)
        attachment = FactoryBot.create(
          :foi_attachment, chunks: [existing_chunk]
        )
        job.resource = attachment
        job.save!

        expect { job.destroy }.
          to change { attachment.chunks.count }.to(0)
      end
    end
  end
end
22 changes: 11 additions & 11 deletions spec/models/workflow_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@
RSpec.describe Workflow do
let(:resource) { FactoryBot.build(:foi_attachment) }

describe '.example' do
it 'creates a new Workflow instance with example jobs' do
workflow = Workflow.example(resource)
describe '.chunking' do
it 'creates a new Workflow instance with chunking jobs' do
workflow = Workflow.chunking(resource)
expect(workflow).to be_a(Workflow)
expect(workflow.jobs.map(&:class)).to eq(
[
Workflow::Job,
Workflow::Job,
Workflow::Job
Workflow::Jobs::ConvertToText,
Workflow::Jobs::AnonymizeText,
Workflow::Jobs::CreateChunks
]
)
end
end

describe '#initialize' do
let(:jobs) { [Workflow::Job] }
let(:jobs) { [Workflow::Jobs::ConvertToText] }
let(:workflow) { Workflow.new(resource: resource, jobs: jobs) }

it 'sets the resource and jobs' do
Expand All @@ -28,7 +28,7 @@
end

describe '#run' do
let(:workflow) { Workflow.example(resource) }
let(:workflow) { Workflow.chunking(resource) }
let(:last_job) { workflow.jobs.last }

context 'when the last job is completed' do
Expand All @@ -51,8 +51,8 @@
end

describe '#run_job' do
let(:workflow) { Workflow.example(resource) }
let(:job_class) { WorkflowJob }
let(:workflow) { Workflow.chunking(resource) }
let(:job_class) { Workflow::Jobs::ConvertToText }

it 'queues, runs, and resets jobs as needed' do
initial_job = double('initial_job', pending!: true, run: true)
Expand All @@ -72,7 +72,7 @@
end

describe '#jobs' do
let(:workflow) { Workflow.example(resource) }
let(:workflow) { Workflow.chunking(resource) }

it 'returns an array of job instances' do
expect(workflow.jobs).to all(be_a(Workflow::Job))
Expand Down
5 changes: 5 additions & 0 deletions spec/support/vcr.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# VCR setup for the spec suite.
VCR.configure do |vcr|
  # Hook into WebMock for request interception.
  vcr.hook_into :webmock
  # Directory where recorded cassettes are stored and looked up.
  vcr.cassette_library_dir = 'spec/fixtures/cassettes'
  # Enable VCR's RSpec metadata integration.
  vcr.configure_rspec_metadata!
end

0 comments on commit 2412db2

Please sign in to comment.