From 739f2173c297b0afb33202b542ce04b6fa36a337 Mon Sep 17 00:00:00 2001 From: Peter Yates Date: Thu, 23 Jan 2025 17:30:56 +0000 Subject: [PATCH 1/5] Add super_diff gem for testing --- Gemfile | 1 + Gemfile.lock | 9 +++++++++ spec/rails_helper.rb | 1 + 3 files changed, 11 insertions(+) diff --git a/Gemfile b/Gemfile index c8e7c744..c8cffdcf 100644 --- a/Gemfile +++ b/Gemfile @@ -49,6 +49,7 @@ group :test do gem "rspec" gem "rspec-rails" gem "shoulda-matchers" + gem "super_diff" end group :development, :test do diff --git a/Gemfile.lock b/Gemfile.lock index 48fdf49b..2ee3bf7d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -97,6 +97,7 @@ GEM nokogiri asciidoctor (2.0.23) ast (2.4.2) + attr_extras (7.1.0) attr_required (1.0.2) base32 (0.3.4) base64 (0.2.0) @@ -359,6 +360,7 @@ GEM tzinfo validate_url webfinger (~> 2.0) + optimist (3.2.0) pagy (9.2.1) parallel (1.26.3) parser (3.3.7.0) @@ -366,6 +368,8 @@ GEM racc pastel (0.8.0) tty-color (~> 0.5) + patience_diff (1.2.0) + optimist (~> 3.0) pg (1.5.9) playwright-ruby-client (1.49.0) concurrent-ruby (>= 1.1.6) @@ -536,6 +540,10 @@ GEM set (~> 1.0) stackprof (0.2.27) stringio (3.1.2) + super_diff (0.15.0) + attr_extras (>= 6.2.4) + diff-lcs + patience_diff swd (2.0.3) activesupport (>= 3) attr_required (>= 0.0.5) @@ -634,6 +642,7 @@ DEPENDENCIES shoulda-matchers solid_queue stackprof + super_diff tzinfo-data webrick diff --git a/spec/rails_helper.rb b/spec/rails_helper.rb index 5e69a297..ab7173fc 100644 --- a/spec/rails_helper.rb +++ b/spec/rails_helper.rb @@ -1,5 +1,6 @@ require 'ostruct' require 'spec_helper' +require "super_diff/rspec-rails" ENV['RAILS_ENV'] ||= 'test' require_relative '../config/environment' From 3e251c50e968ec44560dffbef18199afc767812d Mon Sep 17 00:00:00 2001 From: Peter Yates Date: Thu, 23 Jan 2025 17:32:46 +0000 Subject: [PATCH 2/5] Allow the rake task to take CSV location arguments --- lib/tasks/appropriate_body_import.rake | 32 +++++++++++++++++++++----- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/lib/tasks/appropriate_body_import.rake b/lib/tasks/appropriate_body_import.rake index a3201ec7..822f141b 100644 --- a/lib/tasks/appropriate_body_import.rake +++ b/lib/tasks/appropriate_body_import.rake @@ -1,19 +1,39 @@ +require 'optparse' + namespace :appropriate_body do desc "Import Appropriate Body data from the old AB Portal" task import: :environment do + appropriate_bodies_csv = 'db/samples/appropriate-body-portal/appropriate-body.csv' + teachers_csv = 'db/samples/appropriate-body-portal/teachers.csv' + induction_periods_csv = 'db/samples/appropriate-body-portal/induction-periods.csv' + + files = {} + + opts = OptionParser.new + opts.banner = "Usage: rake appropriate_body:import --appropriate-body /tmp/ab.csv --teacher /tmp/teacher.csv --induction-period /tmp/ip.csv" + opts.on("-a", "--appropriate-body ARG", String) { |val| files[:appropriate_bodies_csv] = val } + opts.on("-t", "--teacher ARG", String) { |val| files[:teachers_csv] = val } + opts.on("-i", "--induction-period ARG", String) { |val| files[:induction_periods_csv] = val } + args = opts.order!(ARGV) {} + opts.parse!(args) + + appropriate_bodies_csv = files[:appropriate_bodies_csv] if files[:appropriate_bodies_csv] + teachers_csv = files[:teachers_csv] if files[:teachers_csv] + induction_periods_csv = files[:induction_periods_csv] if files[:induction_periods_csv] + logger = Logger.new($stdout) logger.info "Checking files exist" - logger.info "Importing Appropriate Body records" - imported_abs = AppropriateBodies::Importers::AppropriateBodyImporter.new.import + logger.info "Importing Appropriate Body records from #{appropriate_bodies_csv}" + imported_abs = AppropriateBodies::Importers::AppropriateBodyImporter.new(appropriate_bodies_csv).import logger.info " #{imported_abs} Appropriate Body records imported ✅" - logger.info "Importing Teacher records" - imported_teachers, total_teachers = AppropriateBodies::Importers::TeacherImporter.new.import + logger.info "Importing Teacher records from #{teachers_csv}" + imported_teachers, total_teachers = AppropriateBodies::Importers::TeacherImporter.new(teachers_csv).import logger.info " #{imported_teachers} Teacher records imported out of #{total_teachers} ✅" - logger.info "Importing Induction Periods" - imported_induction_periods, total_induction_periods = AppropriateBodies::Importers::InductionPeriodImporter.new.import + logger.info "Importing Induction Periods from #{induction_periods_csv}" + imported_induction_periods, total_induction_periods = AppropriateBodies::Importers::InductionPeriodImporter.new(induction_periods_csv).import logger.info " #{imported_induction_periods} Induction period records imported out of #{total_induction_periods} ✅" end end From 05e701afa4ef743d82bfb54d7a77ab6b8a16029e Mon Sep 17 00:00:00 2001 From: Peter Yates Date: Thu, 23 Jan 2025 17:32:10 +0000 Subject: [PATCH 3/5] Add new importer class --- .../importers/appropriate_body_importer.rb | 50 ++- lib/appropriate_bodies/importers/importer.rb | 15 + .../importers/induction_period_importer.rb | 144 +++++---- .../importers/teacher_importer.rb | 65 ++-- .../induction_period_importer_spec.rb | 285 +++++++++++++++--- 5 files changed, 418 insertions(+), 141 deletions(-) create mode 100644 lib/appropriate_bodies/importers/importer.rb diff --git a/lib/appropriate_bodies/importers/appropriate_body_importer.rb b/lib/appropriate_bodies/importers/appropriate_body_importer.rb index 395b85e6..8842ac8a 100644 --- a/lib/appropriate_bodies/importers/appropriate_body_importer.rb +++ b/lib/appropriate_bodies/importers/appropriate_body_importer.rb @@ -2,22 +2,41 @@ module AppropriateBodies::Importers class AppropriateBodyImporter - def initialize(filename = Rails.root.join('db/samples/appropriate-body-portal/appropriate-body.csv')) - @csv = CSV.read(filename, headers: true) - end + IMPORT_ERROR_LOG = 'tmp/appropriate_body_import.log'.freeze - def import - AppropriateBody.transaction do - @csv.each do |row| - Rails.logger.debug("attempting to import row: #{row.to_h}") + Row = Struct.new(:legacy_id, :name, :dfe_sign_in_organisation_id, :local_authority_code, :establishment_number) - AppropriateBody.create!(**build(row)) - end - end + def initialize(filename) + @csv = CSV.read(filename, headers: true) + + File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) } + @import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT) + end - @csv.count + def rows + @csv.map { |row| Row.new(**build(row)) } end + # def import + # AppropriateBody.transaction do + # @csv.each.with_index(1) do |row, i| + # Rails.logger.debug("attempting to import row: #{row.to_h}") + # + # begin + # AppropriateBody.create!(**build(row)) + # rescue ActiveRecord::RecordInvalid => e + # @import_error_log.error "#########################" + # @import_error_log.error "Failed to import Appropriate Body" + # @import_error_log.error "Row number: #{i}" + # @import_error_log.error "Message: #{e.message}" + # @import_error_log.error "Row data: #{row}" + # end + # end + # end + # + # @csv.count + # end + private def build(row) @@ -67,8 +86,15 @@ def extract_local_authority_code_and_establishment_number(row) local_authority_code: local_authority_code[0..2], establishment_number: local_authority_code[4..8] } + when %r{\A\d{7}\z} + { + local_authority_code: local_authority_code[0..2], + establishment_number: local_authority_code[3..7] + } else - Rails.logger.debug("Can't import #{local_authority_code} from #{row}") + @import_error_log.error "#########################" + @import_error_log.error "Invalid local authority code" + @import_error_log.error "Value: #{local_authority_code}" {} end diff --git a/lib/appropriate_bodies/importers/importer.rb b/lib/appropriate_bodies/importers/importer.rb new file mode 100644 index 00000000..f50bd7a5 --- /dev/null +++ b/lib/appropriate_bodies/importers/importer.rb @@ -0,0 +1,15 @@ +module AppropriateBodies::Importers + class Importer + def initialize(appropriate_body_csv, teachers_csv, induction_period_csv) + # ab_importer_rows = AppropriateBodyImporter.new(appropriate_body_csv).rows + # teacher_importer_rows = TeacherImporter.new(teachers_csv).rows + induction_periods_grouped_by_trn = InductionPeriodImporter.new(induction_period_csv).periods_by_trn + + true + + binding.debugger + # get induction periods grouped by TRN + # + end + end +end diff --git a/lib/appropriate_bodies/importers/induction_period_importer.rb b/lib/appropriate_bodies/importers/induction_period_importer.rb index 5232658b..3394afe4 100644 --- a/lib/appropriate_bodies/importers/induction_period_importer.rb +++ b/lib/appropriate_bodies/importers/induction_period_importer.rb @@ -2,23 +2,30 @@ module AppropriateBodies::Importers class InductionPeriodImporter + IMPORT_ERROR_LOG = 'tmp/induction_period_import.log'.freeze LOGFILE = Rails.root.join("log/induction_period_import.log").freeze attr_accessor :csv, :data - Row = Struct.new(:appropriate_body_id, :started_on, :finished_on, :induction_programme, :number_of_terms, :trn, keyword_init: true) do + Row = Struct.new(:appropriate_body_id, :started_on, :finished_on, :induction_programme, :number_of_terms, :trn, :notes, keyword_init: true) do def range started_on...finished_on end - def to_hash - { appropriate_body_id:, started_on:, finished_on:, number_of_terms:, induction_programme: convert_induction_programme } - end - def length (finished_on || Time.zone.today) - started_on end + # used in notes + def to_h + { appropriate_body_id:, started_on:, finished_on:, induction_programme:, number_of_terms: } + end + + # used for comparisons in tests + def to_hash + { appropriate_body_id:, started_on:, finished_on:, number_of_terms:, induction_programme: convert_induction_programme } + end + private def convert_induction_programme @@ -30,67 +37,51 @@ def convert_induction_programme end end - def initialize(filename: Rails.root.join('db/samples/appropriate-body-portal/induction-periods.csv'), csv: nil) - File.delete(LOGFILE) if File.exist?(LOGFILE) - + def initialize(filename, csv: nil) @csv = csv || CSV.read(filename, headers: true) - data = @csv.map do |row| - logger.debug("attempting to import row: #{row.to_hash}") - - unless (appropriate_body_id = appropriate_bodies[row['appropriate_body_id']]) - logger.warn("No AB found for ID #{row['appropriate_body_id']}") - - next - end - - Row.new( - appropriate_body_id:, - started_on: extract_date(row['started_on']), - finished_on: extract_date(row['finished_on']), - induction_programme: row['induction_programme_choice'], - number_of_terms: row['number_of_terms'].to_i, - trn: row['trn'] - ) - end - - @data = data.compact + File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) } + @import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT) end - def import - count = 0 - - periods_by_trn.each do |trn, rows| - logger.debug("adding rows for #{trn}") - teacher_id = teachers[trn] - - InductionPeriod.transaction do - rows.map do |row| - InductionPeriod.create!(**row, teacher_id:) - count += 1 - end - end - end + def rows + @rows ||= @csv.map { |row| Row.new(**build(row)) } + end - [count, @csv.count] + def build(row) + { + appropriate_body_id: row['appropriate_body_id'], + started_on: extract_date(row['started_on']), + finished_on: extract_date(row['finished_on']), + induction_programme: row['induction_programme_choice'], + number_of_terms: row['number_of_terms'].to_i, + trn: row['trn'], + notes: [] + } end def periods_by_trn - @data - .reject { |ip| ip.finished_on && ip.started_on >= ip.finished_on } # FIXME: log these + rows + .reject { |ip| ip.started_on.nil? } + .reject { |ip| ip.started_on == Date.new(1, 1, 1) } + .reject { |ip| ip.finished_on && ip.started_on >= ip.finished_on } .group_by(&:trn) + .select { |_trn, periods| periods.any? { |p| p.finished_on.nil? } } .transform_values { |periods| periods.sort_by { |p| [p.started_on, p.length, p.appropriate_body_id] } } .each_with_object({}) do |(trn, rows), h| keep = [] rows.each do |current| keep << current and next if keep.empty? - keep << current and next if keep.none? { |sibling| current.range.overlap?(sibling.range) } + keep << current and next if keep.none? { |already_recorded| current.range.overlap?(already_recorded.range) } keep .select { |sibling| sibling.range.overlap?(current.range) } .each do |sibling| - if sibling.appropriate_body_id == current.appropriate_body_id + original_sibling = sibling.to_h + original_current = current.to_h + + if sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme == current.induction_programme case when sibling.range.cover?(current.range) # ┌─────────────────────────────┐ @@ -99,6 +90,12 @@ def periods_by_trn # ┌──────────────────────────────────────┐ # Sibling │ KEEP │ # └──────────────────────────────────────┘ + sibling.number_of_terms = [sibling.number_of_terms, current.number_of_terms].max + sibling.notes << { + heading: "Imported from DQT", + body: "DQT held 2 overlapping induction periods for this teacher/appropriate body combination. 1 was discarded", + data: { originals: [original_sibling, original_current], combined: sibling.to_h } + } next when current.range.cover?(sibling.range) # ┌──────────────────────────────────────┐ @@ -107,6 +104,12 @@ def periods_by_trn # ┌─────────────────────────────┐ # Sibling │ DISCARD │ # └─────────────────────────────┘ + current.number_of_terms = [sibling.number_of_terms, current.number_of_terms].max + current.notes << { + heading: "Imported from DQT", + body: "DQT held 2 overlapping induction periods for this teacher/appropriate body combination. 1 was discarded", + data: { originals: [original_sibling, original_current], combined: current.to_h } + } keep.delete(sibling) keep << current when sibling.range.cover?(current.started_on) && !sibling.range.cover?(current.finished_on) @@ -117,7 +120,13 @@ def periods_by_trn # ┌─────────────────────────────────────────┬───┐ # Sibling │ EXTEND │╳╳╳│ # └─────────────────────────────────────────┴───┘ + current.number_of_terms = [sibling.number_of_terms, current.number_of_terms].max sibling.finished_on = current.finished_on + sibling.notes << { + heading: "Imported from DQT", + body: "DQT held 2 overlapping induction periods for this teacher/appropriate body combination. 1 was extended to cover the full duration.", + data: { originals: [original_sibling, original_current], combined: sibling.to_h } + } when !sibling.range.cover?(current.started_on) && sibling.range.cover(current.finished_on) # ┌──────────────────────────────────────┐ # Current │ DISCARD │ @@ -126,10 +135,34 @@ def periods_by_trn # ┌─────┬──────────────────────────────────────┐ # Sibling │╳╳╳╳╳│ EXTEND │ # └─────┴──────────────────────────────────────┘ + sibling.number_of_terms = [sibling.number_of_terms, current.number_of_terms].max sibling.started_on = current.started_on + sibling.notes << { + heading: "Imported from DQT", + body: "DQT held 2 overlapping induction periods for this teacher/appropriate body combination. 1 was extended to cover the full duration.", + data: { originals: [original_sibling, original_current], combined: sibling.to_h } + } else fail end + elsif sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme != current.induction_programme + case + when sibling.range.cover?(current.started_on) && !sibling.range.cover?(current.finished_on) + # ┌─────────────────────────────────┐ + # Current │ KEEP │ + # └─────────────────────────────────┘ + # + # ┌─────────┬┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┐ + # Sibling │ SHRINK │ ┊ + # └─────────┴┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┘ + sibling.finished_on = current.started_on + sibling.notes << { + heading: "Imported from DQT", + body: "DQT held 2 overlapping induction periods for this teacher/appropriate body combination with different induction programmes. This record was cut off when the later one started to prevent overlaps.", + data: { originals: [original_sibling, original_current] } + } + keep << current + end else case when sibling.range.cover?(current.range) || current.range.cover?(sibling.range) @@ -148,6 +181,11 @@ def periods_by_trn # Sibling │ SHRINK │ ┊ # └─────────┴┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┘ sibling.finished_on = current.started_on + sibling.notes << { + heading: "Imported from DQT", + body: "DQT held 2 overlapping induction periods for this teacher with different appropriate bodies. This record was cut off when the later one started to prevent overlaps.", + data: { originals: [original_sibling, original_current] } + } keep << current end end @@ -170,18 +208,6 @@ def logger end end - def appropriate_bodies - @appropriate_bodies ||= AppropriateBody - .select(:id, :legacy_id) - .each_with_object({}) { |t, h| h[t.legacy_id] = t.id } - end - - def teachers - @teachers ||= Teacher - .select(:id, :trn) - .each_with_object({}) { |t, h| h[t.trn] = t.id } - end - def extract_date(datetime) return if datetime.blank? diff --git a/lib/appropriate_bodies/importers/teacher_importer.rb b/lib/appropriate_bodies/importers/teacher_importer.rb index 4f891df9..e9d6fbb5 100644 --- a/lib/appropriate_bodies/importers/teacher_importer.rb +++ b/lib/appropriate_bodies/importers/teacher_importer.rb @@ -2,40 +2,59 @@ module AppropriateBodies::Importers class TeacherImporter - def initialize(filename = Rails.root.join('db/samples/appropriate-body-portal/teachers.csv')) - @csv = CSV.read(filename, headers: true) - end - - def import - count = 0 - - Teacher.transaction do - @csv.each do |row| - Rails.logger.debug("attempting to import row: #{row.to_h}") + IMPORT_ERROR_LOG = 'tmp/teacher_import.log'.freeze - next if row.values_at('trn', 'first_name', 'last_name').any?(&:blank?) + Row = Struct.new(:trn, :trs_first_name, :trs_last_name) - Teacher.create!(**(build(row))).tap do |teacher| - if (induction_extension_terms = convert_extension(row)) && induction_extension_terms.positive? - Rails.logger.warn("Importing extension: #{induction_extension_terms}") - teacher.induction_extensions.create!(number_of_terms: induction_extension_terms) - end - end + def initialize(filename) + @csv = CSV.read(filename, headers: true) - count += 1 - end - end + File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) } + @import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT) + end - [count, @csv.count] + def rows + @csv.first(5000).map { |row| Row.new(**build(row)) } end + # def import + # count = 0 + # + # Teacher.transaction do + # @csv.each.with_index(1) do |row, i| + # Rails.logger.debug("attempting to import row: #{row.to_h}") + # + # next if row.values_at('trn', 'first_name', 'last_name').any?(&:blank?) + # + # begin + # Teacher.create!(**(build(row))).tap do |teacher| + # if (induction_extension_terms = convert_extension(row)) && induction_extension_terms.positive? + # Rails.logger.warn("Importing extension: #{induction_extension_terms}") + # teacher.induction_extensions.create!(number_of_terms: induction_extension_terms) + # end + # end + # rescue ActiveRecord::RecordInvalid => e + # @import_error_log.error "#########################" + # @import_error_log.error "Failed to import Teacher" + # @import_error_log.error "Row number: #{i}" + # @import_error_log.error "Message: #{e.message}" + # @import_error_log.error "Row data: #{row}" + # end + # + # count += 1 + # end + # end + # + # [count, @csv.count] + # end + private def build(row) { trn: row['trn'], - first_name: row['first_name'].strip, - last_name: row['last_name'].strip, + trs_first_name: row['first_name'].strip, + trs_last_name: row['last_name']&.strip, } end diff --git a/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb b/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb index 800f682c..690de623 100644 --- a/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb +++ b/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb @@ -19,23 +19,23 @@ end let(:sample_csv) { CSV.parse(sample_csv_data, headers: true) } - subject { AppropriateBodies::Importers::InductionPeriodImporter.new(csv: sample_csv) } + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } it 'converts csv rows to Row objects when initialized' do - expect(subject.data).to all(be_a(AppropriateBodies::Importers::InductionPeriodImporter::Row)) + expect(subject.rows).to all(be_a(AppropriateBodies::Importers::InductionPeriodImporter::Row)) end it 'converts all rows' do - expect(subject.data.size).to eql(4) + expect(subject.rows.size).to eql(4) end describe 'mapping induction programmes' do it 'converts names to codes properly' do mappings = { - subject.data.find { |r| r.appropriate_body_id == ab_1.id } => 'cip', - subject.data.find { |r| r.appropriate_body_id == ab_2.id } => 'diy', - subject.data.find { |r| r.appropriate_body_id == ab_3.id } => 'fip', - subject.data.find { |r| r.appropriate_body_id == ab_4.id } => 'fip' # defaults when missing + subject.rows.find { |r| r.appropriate_body_id == ab_1.legacy_id } => 'cip', + subject.rows.find { |r| r.appropriate_body_id == ab_2.legacy_id } => 'diy', + subject.rows.find { |r| r.appropriate_body_id == ab_3.legacy_id } => 'fip', + subject.rows.find { |r| r.appropriate_body_id == ab_4.legacy_id } => 'fip' # defaults when missing } mappings.each do |row, expected_induction_programme| expect(row.to_hash.fetch(:induction_programme)).to eql(expected_induction_programme) @@ -44,7 +44,7 @@ end describe 'rebuilding periods' do - context 'when an ECT has one induction period with one AB' do + context 'when an ECT has no open induction periods' do let(:sample_csv_data) do <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn @@ -52,7 +52,20 @@ CSV end - subject { AppropriateBodies::Importers::InductionPeriodImporter.new(csv: sample_csv) } + it 'skips them' do + expect(subject.periods_as_hashes_by_trn).not_to have_key('2600071') + end + end + + context 'when an ECT has one induction period with one AB' do + let(:sample_csv_data) do + <<~CSV + appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,,,3,2600071 + CSV + end + + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } it 'contains the original row untouched' do expect(subject.periods_as_hashes_by_trn).to eql( @@ -60,11 +73,12 @@ '2600071' => [ AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 1, 1), - finished_on: Date.new(2012, 10, 31), + finished_on: nil, induction_programme: nil, - appropriate_body_id: ab_1.id, + appropriate_body_id: ab_1.legacy_id, trn: '2600071', - number_of_terms: 3 + number_of_terms: 3, + notes: [] ).to_hash ] } @@ -72,17 +86,18 @@ end end - context 'when an ECT has two induction period with one AB' do + context 'when an ECT has two induction periods that have the same programme type with one AB' do context 'and the first period ends after the second period has started' do let(:sample_csv_data) do <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn - 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,03/03/2012 00:00:00,Core Induction Programme,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,3,2600071 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,04/04/2012 00:00:00,Full Induction Programme,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2021 00:00:00,,Full Induction Programme,,2600071 CSV end - subject { AppropriateBodies::Importers::InductionPeriodImporter.new(csv: sample_csv) } + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } it 'combines the two periods so the new period has the earliest start date and the latest finish date' do expect(subject.periods_as_hashes_by_trn).to eql( @@ -91,10 +106,21 @@ AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), - induction_programme: 'Core Induction Programme', - appropriate_body_id: ab_1.id, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 3, + notes: [] + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 2, 2), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, trn: '2600071', - number_of_terms: 3 + number_of_terms: 0, + notes: [] ).to_hash ] } @@ -106,12 +132,13 @@ let(:sample_csv_data) do <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn - 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,04/04/2012 00:00:00,,3,2600071 - 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,03/03/2012 00:00:00,,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,04/04/2012 00:00:00,Full Induction Programme,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,4,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2021 00:00:00,,Full Induction Programme,,2600071 CSV end - subject { AppropriateBodies::Importers::InductionPeriodImporter.new(csv: sample_csv) } + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } it 'combines the two periods so the new period has the earliest start date and the latest finish date' do expect(subject.periods_as_hashes_by_trn).to eql( @@ -120,39 +147,66 @@ AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), - induction_programme: nil, - appropriate_body_id: ab_1.id, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, trn: '2600071', - number_of_terms: 3 + number_of_terms: 4 + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 1, 1), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 0 ).to_hash ] } ) end + + it 'keeps the higher number of terms' do + number_of_terms = subject.periods_as_hashes_by_trn.dig('2600071', 0, :number_of_terms) + + expect(number_of_terms).to be(4) + end end context 'and the first period contains the second' do let(:sample_csv_data) do <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn - 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,04/04/2012 00:00:00,,2,2600071 - 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,03/03/2012 00:00:00,,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,04/04/2012 00:00:00,Full Induction Programme,2,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2021 00:00:00,,Full Induction Programme,3,2600071 CSV end - subject { AppropriateBodies::Importers::InductionPeriodImporter.new(csv: sample_csv) } + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } - it 'only the first is kept' do + it 'keeps the dates from the first' do expect(subject.periods_as_hashes_by_trn).to eql( { '2600071' => [ AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), - induction_programme: nil, - appropriate_body_id: ab_1.id, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, trn: '2600071', - number_of_terms: 2 + number_of_terms: 3, + notes: [] + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 2, 2), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 3, + notes: [] ).to_hash ] } @@ -164,12 +218,13 @@ let(:sample_csv_data) do <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn - 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,03/03/2012 00:00:00,Core Induction Programme,1,2600071 - 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,04/04/2012 00:00:00,Full induction programme,4,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,1,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,04/04/2012 00:00:00,Full Induction Programme,4,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2021 00:00:00,,Full induction programme,0,2600071 CSV end - subject { AppropriateBodies::Importers::InductionPeriodImporter.new(csv: sample_csv) } + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } it 'only the second is kept' do expect(subject.periods_as_hashes_by_trn).to eql( @@ -179,9 +234,20 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.id, + appropriate_body_id: ab_1.legacy_id, trn: '2600071', - number_of_terms: 4 + number_of_terms: 4, + notes: [] + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 1, 1), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 0, + notes: [] ).to_hash ] } @@ -190,13 +256,16 @@ end end - context 'when an ECT has two induction period with one AB' do + context 'when an ECT has two induction periods that have different programme types with one AB' do + xspecify 'when one contains the other' + context 'when the periods do not overlap' do let(:sample_csv_data) do <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn - 1ddf3e82-c1ae-e311-b8ed-005056822391,01/01/2012 00:00:00,03/03/2012 00:00:00,Core Induction Programme,4,3600071 - 025e61e7-ec32-eb11-a813-000d3a228dfc,03/03/2012 00:00:00,05/05/2012 00:00:00,Full Induction Programme,2,3600071 + #{ab_2.legacy_id},01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,4,3600071 + #{ab_2.legacy_id},03/03/2012 00:00:00,05/05/2012 00:00:00,Core Induction Programme,2,3600071 + #{ab_2.legacy_id},03/03/2021 00:00:00,,Full Induction Programme,2,3600071 CSV end @@ -207,18 +276,130 @@ AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 3, 3), + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_2.legacy_id, + trn: '2600071', + number_of_terms: 4, + notes: [] + ).to_hash, + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2012, 3, 3), + finished_on: Date.new(2012, 5, 5), induction_programme: 'Core Induction Programme', - appropriate_body_id: ab_2.id, + appropriate_body_id: ab_2.legacy_id, trn: '2600071', - number_of_terms: 4 + number_of_terms: 2, + notes: [] + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 3, 3), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_2.legacy_id, + trn: '2600071', + number_of_terms: 2, + notes: [] + ).to_hash + ] + } + ) + end + end + + context 'when the periods overlap' do + let(:sample_csv_data) do + <<~CSV + appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn + #{ab_2.legacy_id},01/01/2012 00:00:00,05/05/2012 00:00:00,Full Induction Programme,4,3600071 + #{ab_2.legacy_id},04/04/2012 00:00:00,06/06/2012 00:00:00,Core Induction Programme,2,3600071 + #{ab_2.legacy_id},03/03/2021 00:00:00,,Full Induction Programme,2,3600071 + CSV + end + + it 'the earlier one is curtailed so it does not clash with the later one' do + expect(subject.periods_as_hashes_by_trn).to eql( + { + '3600071' => [ + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2012, 1, 1), + finished_on: Date.new(2012, 4, 4), + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_2.legacy_id, + trn: '2600071', + number_of_terms: 4, + notes: [] + ).to_hash, + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2012, 4, 4), + finished_on: Date.new(2012, 6, 6), + induction_programme: 'Core Induction Programme', + appropriate_body_id: ab_2.legacy_id, + trn: '2600071', + number_of_terms: 2, + notes: [] + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 3, 3), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_2.legacy_id, + trn: '2600071', + number_of_terms: 2, + notes: [] + ).to_hash + ] + } + ) + end + end + end + + context 'when an ECT has two induction period with different ABs' do + xspecify 'when one contains the other' + + context 'when the periods do not overlap' do + let(:sample_csv_data) do + <<~CSV + appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn + 1ddf3e82-c1ae-e311-b8ed-005056822391,01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,4,3600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,03/03/2012 00:00:00,05/05/2012 00:00:00,Full Induction Programme,2,3600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,03/03/2021 00:00:00,,Full Induction Programme,2,3600071 + CSV + end + + it 'both are kept' do + expect(subject.periods_as_hashes_by_trn).to eql( + { + '3600071' => [ + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2012, 1, 1), + finished_on: Date.new(2012, 3, 3), + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_2.legacy_id, + trn: '2600071', + number_of_terms: 4, + notes: [] ).to_hash, AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 3, 3), finished_on: Date.new(2012, 5, 5), + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 2, + notes: [] + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 3, 3), + finished_on: nil, induction_programme: 'Full Induction Prorgramme', - appropriate_body_id: ab_1.id, + appropriate_body_id: ab_1.legacy_id, trn: '2600071', - number_of_terms: 2 + number_of_terms: 2, + notes: [] ).to_hash ] } @@ -231,7 +412,8 @@ <<~CSV appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn 1ddf3e82-c1ae-e311-b8ed-005056822391,01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,1,3600071 - 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,05/05/2012 00:00:00,Core Induction Programme,4,3600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2012 00:00:00,05/05/2012 00:00:00,Full Induction Programme,4,3600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,02/02/2021 00:00:00,,Full Induction Programme,4,3600071 CSV end @@ -243,15 +425,24 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 2, 2), induction_programme: 'Full Induction Progamme', - appropriate_body_id: ab_2.id, + appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 1 ).to_hash, AppropriateBodies::Importers::InductionPeriodImporter::Row.new( started_on: Date.new(2012, 2, 2), finished_on: Date.new(2012, 5, 5), - induction_programme: 'Core Induction Programme', - appropriate_body_id: ab_1.id, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 4 + ).to_hash, + # unrelated ongoing record so induction periods are included by the import + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2021, 2, 2), + finished_on: nil, + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 4 ).to_hash From 950b8b02337a54a2c0704ab1d89b078c6984e4f2 Mon Sep 17 00:00:00 2001 From: Peter Yates Date: Mon, 27 Jan 2025 12:35:03 +0000 Subject: [PATCH 4/5] Add special cases for periods that start on the same day When one has an end date and number of terms it should be favoured over the other (ongoing one) that doesn't. --- .../importers/induction_period_importer.rb | 36 +++++++++++ .../induction_period_importer_spec.rb | 60 +++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/lib/appropriate_bodies/importers/induction_period_importer.rb b/lib/appropriate_bodies/importers/induction_period_importer.rb index 3394afe4..ffe4f4d6 100644 --- a/lib/appropriate_bodies/importers/induction_period_importer.rb +++ b/lib/appropriate_bodies/importers/induction_period_importer.rb @@ -16,6 +16,14 @@ def length (finished_on || Time.zone.today) - started_on end + def finished? + finished_on.present? && number_of_terms.present? + end + + def ongoing? + !finished? + end + # used in notes def to_h { appropriate_body_id:, started_on:, finished_on:, induction_programme:, number_of_terms: } @@ -83,6 +91,33 @@ def periods_by_trn if sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme == current.induction_programme case + when current.started_on == sibling.started_on && current.finished? && sibling.ongoing? + # ┌─────────────────────────────────────────┐ + # Current │ KEEP │ + # └─────────────────────────────────────────┘ + # ┌──────────────────────────────────────────────────────────> + # Sibling │ DISCARD + # └──────────────────────────────────────────────────────────> + current.notes << { + heading: "Imported from DQT", + body: "DQT held 2 induction periods for this teacher/appropriate body combination with the same start date. The ongoing one was discarded.", + data: { originals: [original_sibling, original_current], combined: current.to_h } + } + keep.delete(sibling) + keep << current + when current.started_on == sibling.started_on && sibling.finished? && current.ongoing? + # ┌──────────────────────────────────────────────────────────> + # Current │ DISCARD + # └──────────────────────────────────────────────────────────> + # ┌─────────────────────────────────────────┐ + # Sibling │ KEEP │ + # └─────────────────────────────────────────┘ + sibling.notes << { + heading: "Imported from DQT", + body: "DQT held 2 induction periods for this teacher/appropriate body combination with the same start date. The ongoing one was discarded.", + data: { originals: [original_sibling, original_current], combined: sibling.to_h } + } + next when sibling.range.cover?(current.range) # ┌─────────────────────────────┐ # Current │ DISCARD │ @@ -120,6 +155,7 @@ def periods_by_trn # ┌─────────────────────────────────────────┬───┐ # Sibling │ EXTEND │╳╳╳│ # └─────────────────────────────────────────┴───┘ + current.number_of_terms = [sibling.number_of_terms, current.number_of_terms].max sibling.finished_on = current.finished_on sibling.notes << { diff --git a/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb b/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb index 690de623..7ec3f084 100644 --- a/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb +++ b/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb @@ -128,6 +128,66 @@ end end + context 'and the first period has an end date and terms but the second does not' do + let(:sample_csv_data) do + <<~CSV + appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,3,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,,Full Induction Programme,,2600071 + CSV + end + + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } + + it 'it keeps the finished (first) period' do + expect(subject.periods_as_hashes_by_trn).to eql( + { + '2600071' => [ + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2012, 1, 1), + finished_on: Date.new(2012, 3, 3), + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 3, + notes: [] + ).to_hash, + ] + } + ) + end + end + + context 'and the second period has an end date and terms but the first does not' do + let(:sample_csv_data) do + <<~CSV + appropriate_body_id,started_on,finished_on,induction_programme_choice,number_of_terms,trn + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,,Full Induction Programme,,2600071 + 025e61e7-ec32-eb11-a813-000d3a228dfc,01/01/2012 00:00:00,03/03/2012 00:00:00,Full Induction Programme,3,2600071 + CSV + end + + subject { AppropriateBodies::Importers::InductionPeriodImporter.new(nil, csv: sample_csv) } + + it 'it keeps the finished (second) period' do + expect(subject.periods_as_hashes_by_trn).to eql( + { + '2600071' => [ + AppropriateBodies::Importers::InductionPeriodImporter::Row.new( + started_on: Date.new(2012, 1, 1), + finished_on: Date.new(2012, 3, 3), + induction_programme: 'Full Induction Programme', + appropriate_body_id: ab_1.legacy_id, + trn: '2600071', + number_of_terms: 3, + notes: [] + ).to_hash, + ] + } + ) + end + end + context 'and the second period ends after the first period has started' do let(:sample_csv_data) do <<~CSV From c3f8871cdc69f1bddc1b752e352752e53ed541da Mon Sep 17 00:00:00 2001 From: Peter Yates Date: Fri, 24 Jan 2025 16:48:36 +0000 Subject: [PATCH 5/5] Rework the import process This change makes the three importers (teacher, AB, IP) less independent and puts the AppropriateBodies::Importers::Importer class in charge. This means the three importers can depend on each other, which means we can make the process *much faster*. The raw teachers table has 1.8m records and parsing it into a Ruby CSV file is slow, and we only want ~50k of those records so it's a pointless exercise. Now we use the induction periods table to work out which teachers we need, and loop through the CSV file looking for the rows we want, before parsing. --- .../importers/appropriate_body_importer.rb | 35 +++----- lib/appropriate_bodies/importers/importer.rb | 59 +++++++++++-- .../importers/induction_period_importer.rb | 26 ++++-- .../importers/teacher_importer.rb | 82 ++++++++++--------- .../induction_period_importer_spec.rb | 54 ++++++------ 5 files changed, 153 insertions(+), 103 deletions(-) diff --git a/lib/appropriate_bodies/importers/appropriate_body_importer.rb b/lib/appropriate_bodies/importers/appropriate_body_importer.rb index 8842ac8a..8ffe2604 100644 --- a/lib/appropriate_bodies/importers/appropriate_body_importer.rb +++ b/lib/appropriate_bodies/importers/appropriate_body_importer.rb @@ -4,38 +4,27 @@ module AppropriateBodies::Importers class AppropriateBodyImporter IMPORT_ERROR_LOG = 'tmp/appropriate_body_import.log'.freeze - Row = Struct.new(:legacy_id, :name, :dfe_sign_in_organisation_id, :local_authority_code, :establishment_number) + Row = Struct.new(:legacy_id, :name, :dfe_sign_in_organisation_id, :local_authority_code, :establishment_number) do + def to_h + { name:, legacy_id:, establishment_number:, local_authority_code:, dfe_sign_in_organisation_id: SecureRandom.uuid } # FIXME: fix dfe_sign_in_organisation_id + end + end - def initialize(filename) + def initialize(filename, wanted_legacy_ids) @csv = CSV.read(filename, headers: true) + @wanted_legacy_ids = wanted_legacy_ids File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) } @import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT) end def rows - @csv.map { |row| Row.new(**build(row)) } - end + @csv.map { |row| + next unless row['id'].in?(@wanted_legacy_ids) - # def import - # AppropriateBody.transaction do - # @csv.each.with_index(1) do |row, i| - # Rails.logger.debug("attempting to import row: #{row.to_h}") - # - # begin - # AppropriateBody.create!(**build(row)) - # rescue ActiveRecord::RecordInvalid => e - # @import_error_log.error "#########################" - # @import_error_log.error "Failed to import Appropriate Body" - # @import_error_log.error "Row number: #{i}" - # @import_error_log.error "Message: #{e.message}" - # @import_error_log.error "Row data: #{row}" - # end - # end - # end - # - # @csv.count - # end + Row.new(**build(row)) + }.compact + end private diff --git a/lib/appropriate_bodies/importers/importer.rb b/lib/appropriate_bodies/importers/importer.rb index f50bd7a5..8f20338e 100644 --- a/lib/appropriate_bodies/importers/importer.rb +++ b/lib/appropriate_bodies/importers/importer.rb @@ -1,15 +1,64 @@ module AppropriateBodies::Importers class Importer + # rubocop:disable Rails/Output def initialize(appropriate_body_csv, teachers_csv, induction_period_csv) - # ab_importer_rows = AppropriateBodyImporter.new(appropriate_body_csv).rows - # teacher_importer_rows = TeacherImporter.new(teachers_csv).rows induction_periods_grouped_by_trn = InductionPeriodImporter.new(induction_period_csv).periods_by_trn - true + active_teachers = induction_periods_grouped_by_trn.keys + teacher_importer_rows = TeacherImporter.new(teachers_csv, active_teachers).rows + + active_abs = induction_periods_grouped_by_trn.flat_map { |_trn, ips| ips.map(&:legacy_appropriate_body_id) }.uniq + ab_importer_rows = AppropriateBodyImporter.new(appropriate_body_csv, active_abs).rows + + puts "Active appropriate bodies: #{active_abs.count}" + # FIXME: use insert_all! here + AppropriateBody.insert_all(ab_importer_rows.select { |r| r.legacy_id.in?(active_abs) }.map(&:to_h)) + puts "Appropriate bodies inserted: #{AppropriateBody.count}" + + puts "Active Teachers: #{teacher_importer_rows.count}" + # FIXME: use insert_all! here + Teacher.insert_all(teacher_importer_rows.map(&:to_h)) + puts "Teachers inserted: #{Teacher.count}" + + # TODO: insert induction periods + teacher_trn_to_id = Teacher.all.select(:id, :trn).each_with_object({}) do |t, h| + h[t[:trn]] = t[:id] + end + + ab_legacy_id_to_id = AppropriateBody.all.select(:id, :legacy_id).each_with_object({}) do |ab, h| + h[ab[:legacy_id]] = ab[:id] + end + + induction_period_rows = [] + + induction_periods_grouped_by_trn.each do |trn, induction_periods| + induction_periods.each do |ip| + begin + ip.teacher_id = teacher_trn_to_id.fetch(trn) + rescue KeyError + puts "No teacher found with trn: #{trn}" + next + end + + begin + ip.appropriate_body_id = ab_legacy_id_to_id.fetch(ip.legacy_appropriate_body_id) + rescue KeyError + puts "No appropriate body found with legacy_id: #{ip.legacy_appropriate_body_id}" + next + end + + induction_period_rows << ip + end + end + + # FIXME: use insert_all! here + InductionPeriod.insert_all(induction_period_rows.map(&:to_record)) + + # TODO: insert extensions + # TODO: insert events binding.debugger - # get induction periods grouped by TRN - # end + # rubocop:enable Rails/Output end end diff --git a/lib/appropriate_bodies/importers/induction_period_importer.rb b/lib/appropriate_bodies/importers/induction_period_importer.rb index ffe4f4d6..a1a02f03 100644 --- a/lib/appropriate_bodies/importers/induction_period_importer.rb +++ b/lib/appropriate_bodies/importers/induction_period_importer.rb @@ -7,7 +7,7 @@ class InductionPeriodImporter attr_accessor :csv, :data - Row = Struct.new(:appropriate_body_id, :started_on, :finished_on, :induction_programme, :number_of_terms, :trn, :notes, keyword_init: true) do + Row = Struct.new(:legacy_appropriate_body_id, :started_on, :finished_on, :induction_programme, :number_of_terms, :trn, :notes, :teacher_id, :appropriate_body_id, keyword_init: true) do def range started_on...finished_on end @@ -26,12 +26,16 @@ def ongoing? # used in notes def to_h - { appropriate_body_id:, started_on:, finished_on:, induction_programme:, number_of_terms: } + { appropriate_body_id: legacy_appropriate_body_id, started_on:, finished_on:, induction_programme:, number_of_terms: } end # used for comparisons in tests def to_hash - { appropriate_body_id:, started_on:, finished_on:, number_of_terms:, induction_programme: convert_induction_programme } + { appropriate_body_id:, started_on:, finished_on:, induction_programme: convert_induction_programme, number_of_terms: } + end + + def to_record + { appropriate_body_id:, started_on:, finished_on:, induction_programme: convert_induction_programme, number_of_terms:, teacher_id: } end private @@ -58,13 +62,15 @@ def rows def build(row) { - appropriate_body_id: row['appropriate_body_id'], + legacy_appropriate_body_id: row['appropriate_body_id'], started_on: extract_date(row['started_on']), finished_on: extract_date(row['finished_on']), induction_programme: row['induction_programme_choice'], number_of_terms: row['number_of_terms'].to_i, trn: row['trn'], - notes: [] + notes: [], + appropriate_body_id: nil, + teacher_id: nil } end @@ -89,7 +95,7 @@ def periods_by_trn original_sibling = sibling.to_h original_current = current.to_h - if sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme == current.induction_programme + if sibling.legacy_appropriate_body_id == current.legacy_appropriate_body_id && sibling.induction_programme == current.induction_programme case when current.started_on == sibling.started_on && current.finished? && sibling.ongoing? # ┌─────────────────────────────────────────┐ @@ -181,7 +187,7 @@ def periods_by_trn else fail end - elsif sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme != current.induction_programme + elsif sibling.legacy_appropriate_body_id == current.legacy_appropriate_body_id && sibling.induction_programme != current.induction_programme case when sibling.range.cover?(current.started_on) && !sibling.range.cover?(current.finished_on) # ┌─────────────────────────────────┐ @@ -199,10 +205,12 @@ def periods_by_trn } keep << current end - else + else # different appropriate bodies case + when sibling.started_on == current.started_on + # TODO: work out what to do here when sibling.range.cover?(current.range) || current.range.cover?(sibling.range) - # an indunction period from one AB entirely contains one from another, + # an induction period from one AB entirely contains one from another, # which do we keep? # # This might never happen in prod so let's ignore it for now diff --git a/lib/appropriate_bodies/importers/teacher_importer.rb b/lib/appropriate_bodies/importers/teacher_importer.rb index e9d6fbb5..e43eea52 100644 --- a/lib/appropriate_bodies/importers/teacher_importer.rb +++ b/lib/appropriate_bodies/importers/teacher_importer.rb @@ -4,57 +4,61 @@ module AppropriateBodies::Importers class TeacherImporter IMPORT_ERROR_LOG = 'tmp/teacher_import.log'.freeze - Row = Struct.new(:trn, :trs_first_name, :trs_last_name) + Row = Struct.new(:trn, :first_name, :last_name, :induction_status) do + def to_h + { + trn:, + trs_first_name: first_name_with_fallback, + trs_last_name: last_name_with_fallback, + } + end - def initialize(filename) - @csv = CSV.read(filename, headers: true) + def first_name_with_fallback + first_name || 'Unknown' + end - File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) } - @import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT) + def last_name_with_fallback + last_name || 'Unknown' + end end - def rows - @csv.first(5000).map { |row| Row.new(**build(row)) } + def initialize(filename, wanted_trns) + sorted_wanted_trns = wanted_trns.sort + + file = File.readlines(filename) + file.delete_at(0) + sorted_lines = file.sort + + wanted_lines = [] + + seek = sorted_wanted_trns.shift + + sorted_lines.each do |line| + next unless line.start_with?(seek) + + wanted_lines << line + + break if sorted_wanted_trns.empty? + + seek = sorted_wanted_trns.shift + end + + @csv = CSV.parse(wanted_lines.join, headers: %w[trn first_name last_name induction_status]) end - # def import - # count = 0 - # - # Teacher.transaction do - # @csv.each.with_index(1) do |row, i| - # Rails.logger.debug("attempting to import row: #{row.to_h}") - # - # next if row.values_at('trn', 'first_name', 'last_name').any?(&:blank?) - # - # begin - # Teacher.create!(**(build(row))).tap do |teacher| - # if (induction_extension_terms = convert_extension(row)) && induction_extension_terms.positive? - # Rails.logger.warn("Importing extension: #{induction_extension_terms}") - # teacher.induction_extensions.create!(number_of_terms: induction_extension_terms) - # end - # end - # rescue ActiveRecord::RecordInvalid => e - # @import_error_log.error "#########################" - # @import_error_log.error "Failed to import Teacher" - # @import_error_log.error "Row number: #{i}" - # @import_error_log.error "Message: #{e.message}" - # @import_error_log.error "Row data: #{row}" - # end - # - # count += 1 - # end - # end - # - # [count, @csv.count] - # end + def rows + @rows ||= @csv.map { |row| Row.new(**build(row)) } + end private def build(row) { trn: row['trn'], - trs_first_name: row['first_name'].strip, - trs_last_name: row['last_name']&.strip, + first_name: row['first_name']&.strip, + last_name: row['last_name']&.strip, + induction_status: row['induction_status'] + # FIXME: extensions } end diff --git a/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb b/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb index 7ec3f084..85cdc790 100644 --- a/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb +++ b/spec/lib/appropriate_bodies/importers/induction_period_importer_spec.rb @@ -32,10 +32,10 @@ describe 'mapping induction programmes' do it 'converts names to codes properly' do mappings = { - subject.rows.find { |r| r.appropriate_body_id == ab_1.legacy_id } => 'cip', - subject.rows.find { |r| r.appropriate_body_id == ab_2.legacy_id } => 'diy', - subject.rows.find { |r| r.appropriate_body_id == ab_3.legacy_id } => 'fip', - subject.rows.find { |r| r.appropriate_body_id == ab_4.legacy_id } => 'fip' # defaults when missing + subject.rows.find { |r| r.legacy_appropriate_body_id == ab_1.legacy_id } => 'cip', + subject.rows.find { |r| r.legacy_appropriate_body_id == ab_2.legacy_id } => 'diy', + subject.rows.find { |r| r.legacy_appropriate_body_id == ab_3.legacy_id } => 'fip', + subject.rows.find { |r| r.legacy_appropriate_body_id == ab_4.legacy_id } => 'fip' # defaults when missing } mappings.each do |row, expected_induction_programme| expect(row.to_hash.fetch(:induction_programme)).to eql(expected_induction_programme) @@ -75,7 +75,7 @@ started_on: Date.new(2012, 1, 1), finished_on: nil, induction_programme: nil, - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 3, notes: [] @@ -107,7 +107,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 3, notes: [] @@ -117,7 +117,7 @@ started_on: Date.new(2021, 2, 2), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 0, notes: [] @@ -147,7 +147,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 3, 3), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 3, notes: [] @@ -177,7 +177,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 3, 3), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 3, notes: [] @@ -208,7 +208,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 4 ).to_hash, @@ -217,7 +217,7 @@ started_on: Date.new(2021, 1, 1), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 0 ).to_hash @@ -253,7 +253,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 3, notes: [] @@ -263,7 +263,7 @@ started_on: Date.new(2021, 2, 2), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 3, notes: [] @@ -294,7 +294,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 4, notes: [] @@ -304,7 +304,7 @@ started_on: Date.new(2021, 1, 1), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 0, notes: [] @@ -337,7 +337,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 3, 3), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 4, notes: [] @@ -346,7 +346,7 @@ started_on: Date.new(2012, 3, 3), finished_on: Date.new(2012, 5, 5), induction_programme: 'Core Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 2, notes: [] @@ -356,7 +356,7 @@ started_on: Date.new(2021, 3, 3), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 2, notes: [] @@ -385,7 +385,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 4, 4), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 4, notes: [] @@ -394,7 +394,7 @@ started_on: Date.new(2012, 4, 4), finished_on: Date.new(2012, 6, 6), induction_programme: 'Core Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 2, notes: [] @@ -404,7 +404,7 @@ started_on: Date.new(2021, 3, 3), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 2, notes: [] @@ -437,7 +437,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 3, 3), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 4, notes: [] @@ -446,7 +446,7 @@ started_on: Date.new(2012, 3, 3), finished_on: Date.new(2012, 5, 5), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 2, notes: [] @@ -456,7 +456,7 @@ started_on: Date.new(2021, 3, 3), finished_on: nil, induction_programme: 'Full Induction Prorgramme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 2, notes: [] @@ -485,7 +485,7 @@ started_on: Date.new(2012, 1, 1), finished_on: Date.new(2012, 2, 2), induction_programme: 'Full Induction Progamme', - appropriate_body_id: ab_2.legacy_id, + legacy_appropriate_body_id: ab_2.legacy_id, trn: '2600071', number_of_terms: 1 ).to_hash, @@ -493,7 +493,7 @@ started_on: Date.new(2012, 2, 2), finished_on: Date.new(2012, 5, 5), induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 4 ).to_hash, @@ -502,7 +502,7 @@ started_on: Date.new(2021, 2, 2), finished_on: nil, induction_programme: 'Full Induction Programme', - appropriate_body_id: ab_1.legacy_id, + legacy_appropriate_body_id: ab_1.legacy_id, trn: '2600071', number_of_terms: 4 ).to_hash