Skip to content
This repository has been archived by the owner on Sep 2, 2024. It is now read-only.

Commit

Permalink
add hierarchy script
Browse files Browse the repository at this point in the history
  • Loading branch information
RichardSmithONS committed Mar 4, 2024
1 parent 65da7b3 commit c903d75
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ You will be required to register with OS and abide by the Data Exploration Licen
The data used by ONS to build the Elasticsearch indices is produced as follows:

Every 6 weeks we receive a set of AddressBase ‘Change Only Update’ (COU) files (the latest ‘Epoch’), which are applied to our previous existing AddressBase master tables to produce ‘full’ tables for the latest Epoch.
We run a script against the updated AddressBase data to produce an additional Hierarchy table.
We run a <a href="dbscripts/hierarchy_script.sql">script</a> against the updated AddressBase data to produce an additional Hierarchy table.

The spark job will run without a real Hierarchy table (the dummy one in the unit tests will suffice) but the "relatives" information will be blank on every document. This won't affect the automated matching process, rather the structure of hierarchical addresses helps with clerical resolution.

Expand Down
107 changes: 107 additions & 0 deletions dbscripts/hierarchy_script.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
-- =============================================
-- FileName: usp_ai_hierarchy.sql
-- Description:
-- This Procedure create hierachy table with current records
--
-- =============================================
ALTER PROCEDURE [SCHEMA_NAME].[usp_ai_hierarchy]
AS


SET NOCOUNT ON;

TRUNCATE TABLE [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_parent_child] ;

INSERT INTO [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_parent_child]
SELECT address_entry_id, detail_valid_from_date, detail_valid_to_date, parent_address_entry_id, address_entry_id as primary_address_entry_id, NULL as secondary_address_entry_id,
uprn, parent_uprn, uprn as primary_uprn, NULL as secondary_uprn, 1 as layers, 1 as this_layer
FROM rdmf_uat.rdmf.blpu m
WHERE current_record_flag=1 AND parent_address_entry_id IS NULL
UNION
SELECT address_entry_id, detail_valid_from_date, detail_valid_to_date, parent_address_entry_id, NULL as primary_address_entry_id, NULL as secondary_address_entry_id,
uprn, parent_uprn, NULL as primary_uprn, NULL as secondary_uprn, 0 as layers, 0 as this_layer
FROM rdmf_uat.rdmf.blpu m
WHERE current_record_flag=1 AND parent_address_entry_id IS NOT NULL;

TRUNCATE TABLE [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_uprn_no_parent];

INSERT INTO [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_uprn_no_parent]
SELECT m.address_entry_id, m.detail_valid_from_date, m.detail_valid_to_date, m.parent_address_entry_id, NULL as primary_address_entry_id, NULL as secondary_address_entry_id,
m.uprn, m.parent_uprn, NULL as primary_uprn, NULL as secondary_uprn, 1 as layers, 1 as this_layer
FROM
(SELECT address_entry_id, detail_valid_from_date, detail_valid_to_date, parent_address_entry_id, NULL as primary_address_entry_id, NULL as secondary_address_entry_id,
uprn, parent_uprn, NULL as primary_uprn, NULL as secondary_uprn, 0 as layers, 0 as this_layer
FROM rdmf_uat.rdmf.blpu
WHERE current_record_flag=1 AND parent_address_entry_id IS NOT NULL) m
LEFT JOIN rdmf_uat.rdmf.blpu pm ON pm.current_record_flag=1 AND m.parent_uprn= pm.uprn
WHERE pm.uprn is NULL

TRUNCATE TABLE [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy];

WITH hier_tbl AS (

SELECT address_entry_id, hierarchy_valid_from_date, hierarchy_valid_to_date, parent_address_entry_id, address_entry_id as primary_address_entry_id, secondary_address_entry_id,
uprn, parent_uprn, uprn as primary_uprn, secondary_uprn, layers, this_layer
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_parent_child]
WHERE parent_address_entry_id IS NULL
UNION ALL
SELECT address_entry_id, hierarchy_valid_from_date, hierarchy_valid_to_date, parent_address_entry_id, address_entry_id as primary_address_entry_id, secondary_address_entry_id,
uprn, parent_uprn, uprn as primary_uprn, secondary_uprn, layers, this_layer
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_uprn_no_parent]
UNION ALL
SELECT m.address_entry_id, a.hierarchy_valid_from_date, m.hierarchy_valid_to_date, m.parent_address_entry_id, a.primary_address_entry_id,
CASE
WHEN (a.this_layer=1) THEN m.address_entry_id
ELSE a.secondary_address_entry_id
END as secondary_address_entry_id,
m.uprn, m.parent_uprn, a.primary_uprn,
CASE
WHEN (a.this_layer=1) THEN m.uprn
ELSE a.secondary_uprn
END as secondary_uprn,
a.layers + 1 as layers,
a.layers + 1 as this_layer
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hier_blpu_parent_child] m
JOIN hier_tbl a
ON m.parent_address_entry_id = a.address_entry_id
)

INSERT INTO [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy]
SELECT address_entry_id, hierarchy_valid_from_date, hierarchy_valid_to_date, parent_address_entry_id, primary_address_entry_id, secondary_address_entry_id,
uprn, parent_uprn, primary_uprn, secondary_uprn, layers, this_layer
FROM hier_tbl;


UPDATE hr
SET layers = hr2.max_layer
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy] hr
INNER JOIN (SELECT primary_address_entry_id,MAX(this_layer) OVER (PARTITION BY primary_address_entry_id) AS max_layer
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy] ) hr2
ON hr.primary_address_entry_id = hr2.primary_address_entry_id ;


UPDATE tbh
SET secondary_address_entry_id=(
CASE
WHEN (se2.primary_uprn IS NULL) THEN tbh.secondary_address_entry_id
ELSE se2.secondary_address_entry_id
END ),
secondary_uprn= (CASE
WHEN (se2.primary_uprn IS NULL) THEN tbh.secondary_uprn
ELSE se2.secondary_uprn
END)
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy] tbh
LEFT JOIN (SELECT se.primary_uprn, tbh.secondary_uprn, tbh.secondary_address_entry_id FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy] tbh
INNER JOIN (
SELECT primary_uprn, this_layer FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy] WHERE this_layer=2
GROUP BY primary_uprn, this_layer
HAVING count(*)=1) se ON tbh.primary_uprn=se.primary_uprn AND tbh.this_layer=se.this_layer) se2
ON tbh.primary_uprn=se2.primary_uprn AND tbh.this_layer=1;


TRUNCATE TABLE [rdmf_uat].[rdmf].[hierarchy];

INSERT INTO [rdmf_uat].[rdmf].[hierarchy]
SELECT address_entry_id, hierarchy_valid_from_date, hierarchy_valid_to_date, uprn, this_layer, layers, parent_uprn,
parent_address_entry_id, primary_uprn, primary_address_entry_id, secondary_uprn, secondary_address_entry_id
FROM [DATABASE_NAME].[SCHEMA_NAME].[temp_hierarchy];

0 comments on commit c903d75

Please sign in to comment.