Skip to content

Commit

Permalink
Rework the import process
Browse files Browse the repository at this point in the history
This change makes the three importers (teacher, AB, IP) less independent
and puts the AppropriateBodies::Importers::Importer class in charge.

This means the three importers can depend on each other, which means we
can make the process *much faster*.

The raw teachers table has 1.8m records and parsing it into a Ruby CSV
file is slow, and we only want ~50k of those records so it's a pointless
exercise. Now we use the induction periods table to work out which
teachers we need, and loop through the CSV file looking for the rows we
want, before parsing.
  • Loading branch information
peteryates committed Jan 29, 2025
1 parent 950b8b0 commit c3f8871
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 103 deletions.
35 changes: 12 additions & 23 deletions lib/appropriate_bodies/importers/appropriate_body_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,27 @@ module AppropriateBodies::Importers
class AppropriateBodyImporter
IMPORT_ERROR_LOG = 'tmp/appropriate_body_import.log'.freeze

Row = Struct.new(:legacy_id, :name, :dfe_sign_in_organisation_id, :local_authority_code, :establishment_number)
Row = Struct.new(:legacy_id, :name, :dfe_sign_in_organisation_id, :local_authority_code, :establishment_number) do
def to_h
{ name:, legacy_id:, establishment_number:, local_authority_code:, dfe_sign_in_organisation_id: SecureRandom.uuid } # FIXME: fix dfe_sign_in_organisation_id
end
end

def initialize(filename)
def initialize(filename, wanted_legacy_ids)
@csv = CSV.read(filename, headers: true)
@wanted_legacy_ids = wanted_legacy_ids

File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) }
@import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT)
end

def rows
@csv.map { |row| Row.new(**build(row)) }
end
@csv.map { |row|
next unless row['id'].in?(@wanted_legacy_ids)

# def import
# AppropriateBody.transaction do
# @csv.each.with_index(1) do |row, i|
# Rails.logger.debug("attempting to import row: #{row.to_h}")
#
# begin
# AppropriateBody.create!(**build(row))
# rescue ActiveRecord::RecordInvalid => e
# @import_error_log.error "#########################"
# @import_error_log.error "Failed to import Appropriate Body"
# @import_error_log.error "Row number: #{i}"
# @import_error_log.error "Message: #{e.message}"
# @import_error_log.error "Row data: #{row}"
# end
# end
# end
#
# @csv.count
# end
Row.new(**build(row))
}.compact
end

private

Expand Down
59 changes: 54 additions & 5 deletions lib/appropriate_bodies/importers/importer.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,64 @@
module AppropriateBodies::Importers
class Importer
# rubocop:disable Rails/Output
def initialize(appropriate_body_csv, teachers_csv, induction_period_csv)
# ab_importer_rows = AppropriateBodyImporter.new(appropriate_body_csv).rows
# teacher_importer_rows = TeacherImporter.new(teachers_csv).rows
induction_periods_grouped_by_trn = InductionPeriodImporter.new(induction_period_csv).periods_by_trn

true
active_teachers = induction_periods_grouped_by_trn.keys
teacher_importer_rows = TeacherImporter.new(teachers_csv, active_teachers).rows

active_abs = induction_periods_grouped_by_trn.flat_map { |_trn, ips| ips.map(&:legacy_appropriate_body_id) }.uniq
ab_importer_rows = AppropriateBodyImporter.new(appropriate_body_csv, active_abs).rows

puts "Active appropriate bodies: #{active_abs.count}"
# FIXME: use insert_all! here
AppropriateBody.insert_all(ab_importer_rows.select { |r| r.legacy_id.in?(active_abs) }.map(&:to_h))
puts "Appropriate bodies inserted: #{AppropriateBody.count}"

puts "Active Teachers: #{teacher_importer_rows.count}"
# FIXME: use insert_all! here
Teacher.insert_all(teacher_importer_rows.map(&:to_h))
puts "Teachers inserted: #{Teacher.count}"

# TODO: insert induction periods
teacher_trn_to_id = Teacher.all.select(:id, :trn).each_with_object({}) do |t, h|
h[t[:trn]] = t[:id]
end

ab_legacy_id_to_id = AppropriateBody.all.select(:id, :legacy_id).each_with_object({}) do |ab, h|
h[ab[:legacy_id]] = ab[:id]
end

induction_period_rows = []

induction_periods_grouped_by_trn.each do |trn, induction_periods|
induction_periods.each do |ip|
begin
ip.teacher_id = teacher_trn_to_id.fetch(trn)
rescue KeyError
puts "No teacher found with trn: #{trn}"
next
end

begin
ip.appropriate_body_id = ab_legacy_id_to_id.fetch(ip.legacy_appropriate_body_id)
rescue KeyError
puts "No appropriate body found with legacy_id: #{ip.legacy_appropriate_body_id}"
next
end

induction_period_rows << ip
end
end

# FIXME: use insert_all! here
InductionPeriod.insert_all(induction_period_rows.map(&:to_record))

# TODO: insert extensions
# TODO: insert events

binding.debugger
# get induction periods grouped by TRN
#
end
# rubocop:enable Rails/Output
end
end
26 changes: 17 additions & 9 deletions lib/appropriate_bodies/importers/induction_period_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class InductionPeriodImporter

attr_accessor :csv, :data

Row = Struct.new(:appropriate_body_id, :started_on, :finished_on, :induction_programme, :number_of_terms, :trn, :notes, keyword_init: true) do
Row = Struct.new(:legacy_appropriate_body_id, :started_on, :finished_on, :induction_programme, :number_of_terms, :trn, :notes, :teacher_id, :appropriate_body_id, keyword_init: true) do
def range
started_on...finished_on
end
Expand All @@ -26,12 +26,16 @@ def ongoing?

# used in notes
def to_h
{ appropriate_body_id:, started_on:, finished_on:, induction_programme:, number_of_terms: }
{ appropriate_body_id: legacy_appropriate_body_id, started_on:, finished_on:, induction_programme:, number_of_terms: }
end

# used for comparisons in tests
def to_hash
{ appropriate_body_id:, started_on:, finished_on:, number_of_terms:, induction_programme: convert_induction_programme }
{ appropriate_body_id:, started_on:, finished_on:, induction_programme: convert_induction_programme, number_of_terms: }
end

def to_record
{ appropriate_body_id:, started_on:, finished_on:, induction_programme: convert_induction_programme, number_of_terms:, teacher_id: }
end

private
Expand All @@ -58,13 +62,15 @@ def rows

def build(row)
{
appropriate_body_id: row['appropriate_body_id'],
legacy_appropriate_body_id: row['appropriate_body_id'],
started_on: extract_date(row['started_on']),
finished_on: extract_date(row['finished_on']),
induction_programme: row['induction_programme_choice'],
number_of_terms: row['number_of_terms'].to_i,
trn: row['trn'],
notes: []
notes: [],
appropriate_body_id: nil,
teacher_id: nil
}
end

Expand All @@ -89,7 +95,7 @@ def periods_by_trn
original_sibling = sibling.to_h
original_current = current.to_h

if sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme == current.induction_programme
if sibling.legacy_appropriate_body_id == current.legacy_appropriate_body_id && sibling.induction_programme == current.induction_programme
case
when current.started_on == sibling.started_on && current.finished? && sibling.ongoing?
# ┌─────────────────────────────────────────┐
Expand Down Expand Up @@ -181,7 +187,7 @@ def periods_by_trn
else
fail
end
elsif sibling.appropriate_body_id == current.appropriate_body_id && sibling.induction_programme != current.induction_programme
elsif sibling.legacy_appropriate_body_id == current.legacy_appropriate_body_id && sibling.induction_programme != current.induction_programme
case
when sibling.range.cover?(current.started_on) && !sibling.range.cover?(current.finished_on)
# ┌─────────────────────────────────┐
Expand All @@ -199,10 +205,12 @@ def periods_by_trn
}
keep << current
end
else
else # different appropriate bodies
case
when sibling.started_on == current.started_on
# TODO: work out what to do here
when sibling.range.cover?(current.range) || current.range.cover?(sibling.range)
# an indunction period from one AB entirely contains one from another,
# an induction period from one AB entirely contains one from another,
# which do we keep?
#
# This might never happen in prod so let's ignore it for now
Expand Down
82 changes: 43 additions & 39 deletions lib/appropriate_bodies/importers/teacher_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,57 +4,61 @@ module AppropriateBodies::Importers
class TeacherImporter
IMPORT_ERROR_LOG = 'tmp/teacher_import.log'.freeze

Row = Struct.new(:trn, :trs_first_name, :trs_last_name)
Row = Struct.new(:trn, :first_name, :last_name, :induction_status) do
def to_h
{
trn:,
trs_first_name: first_name_with_fallback,
trs_last_name: last_name_with_fallback,
}
end

def initialize(filename)
@csv = CSV.read(filename, headers: true)
def first_name_with_fallback
first_name || 'Unknown'
end

File.open(IMPORT_ERROR_LOG, 'w') { |f| f.truncate(0) }
@import_error_log = Logger.new(IMPORT_ERROR_LOG, File::CREAT)
def last_name_with_fallback
last_name || 'Unknown'
end
end

def rows
@csv.first(5000).map { |row| Row.new(**build(row)) }
def initialize(filename, wanted_trns)
sorted_wanted_trns = wanted_trns.sort

file = File.readlines(filename)
file.delete_at(0)
sorted_lines = file.sort

wanted_lines = []

seek = sorted_wanted_trns.shift

sorted_lines.each do |line|
next unless line.start_with?(seek)

wanted_lines << line

break if sorted_wanted_trns.empty?

seek = sorted_wanted_trns.shift
end

@csv = CSV.parse(wanted_lines.join, headers: %w[trn first_name last_name induction_status])
end

# def import
# count = 0
#
# Teacher.transaction do
# @csv.each.with_index(1) do |row, i|
# Rails.logger.debug("attempting to import row: #{row.to_h}")
#
# next if row.values_at('trn', 'first_name', 'last_name').any?(&:blank?)
#
# begin
# Teacher.create!(**(build(row))).tap do |teacher|
# if (induction_extension_terms = convert_extension(row)) && induction_extension_terms.positive?
# Rails.logger.warn("Importing extension: #{induction_extension_terms}")
# teacher.induction_extensions.create!(number_of_terms: induction_extension_terms)
# end
# end
# rescue ActiveRecord::RecordInvalid => e
# @import_error_log.error "#########################"
# @import_error_log.error "Failed to import Teacher"
# @import_error_log.error "Row number: #{i}"
# @import_error_log.error "Message: #{e.message}"
# @import_error_log.error "Row data: #{row}"
# end
#
# count += 1
# end
# end
#
# [count, @csv.count]
# end
def rows
@rows ||= @csv.map { |row| Row.new(**build(row)) }
end

private

def build(row)
{
trn: row['trn'],
trs_first_name: row['first_name'].strip,
trs_last_name: row['last_name']&.strip,
first_name: row['first_name']&.strip,
last_name: row['last_name']&.strip,
induction_status: row['induction_status']
# FIXME: extensions
}
end

Expand Down
Loading

0 comments on commit c3f8871

Please sign in to comment.