From 01b4944303c65313790428adba6a08518bae22f4 Mon Sep 17 00:00:00 2001 From: Pete F Date: Thu, 27 Feb 2025 12:55:05 +0000 Subject: [PATCH 1/2] Process AFP category codes on ingestion --- ingestion-lambda/src/categoryCodes.test.ts | 65 ++++++++++++++++++- ingestion-lambda/src/categoryCodes.ts | 72 ++++++++++++++++------ ingestion-lambda/src/handler.ts | 28 ++++++--- 3 files changed, 133 insertions(+), 32 deletions(-) diff --git a/ingestion-lambda/src/categoryCodes.test.ts b/ingestion-lambda/src/categoryCodes.test.ts index fcf07f1..613f3fa 100644 --- a/ingestion-lambda/src/categoryCodes.test.ts +++ b/ingestion-lambda/src/categoryCodes.test.ts @@ -1,4 +1,8 @@ -import {processFingerpostAAPCategoryCodes, processFingerpostAPCategoryCodes} from './categoryCodes'; +import { + processFingerpostAAPCategoryCodes, + processFingerpostAFPCategoryCodes, + processFingerpostAPCategoryCodes, +} from './categoryCodes'; describe('processFingerpostAPCategoryCodes', () => { it('should return an empty array if provided with an empty array', () => { @@ -71,7 +75,7 @@ describe('processFingerpostAAPCategoryCodes', () => { '04007003+food', 'goods|04013002+food', 'and', - 'medtop:20000049' + 'medtop:20000049', ]), ).toEqual(['medtop:20000049', 'subj:04007003', 'subj:04013002']); }); @@ -90,3 +94,60 @@ describe('processFingerpostAAPCategoryCodes', () => { ]); }); }); + +describe('processFingerpostAFPCategoryCodes', () => { + it('should return an empty array if provided with an empty array', () => { + expect(processFingerpostAFPCategoryCodes([])).toEqual([]); + }); + it('should strip out service codes', () => { + expect(processFingerpostAFPCategoryCodes(['service:news'])).toEqual([]); + }); + it('should strip out empty iptccat entries', () => { + expect( + processFingerpostAFPCategoryCodes(['iptccat:', 'iptccat:a']), + ).toEqual(['afpCat:a']); + }); + it('should return simple codes labelled "iptc" as simple "afp" codes', () => { + expect( + processFingerpostAFPCategoryCodes(['iptccat:a', 'iptccat:b']), + ).toEqual(['afpCat:a', 'afpCat:b']); + }); + it('should expand category codes with multiple subcodes', () => { + expect(processFingerpostAFPCategoryCodes(['iptccat:c+d'])).toEqual([ + 'afpCat:c', + 'afpCat:d', + ]); + }); + it('should pass other codes through untransformed', () => { + expect(processFingerpostAFPCategoryCodes(['qCode:value+value'])).toEqual([ + 'qCode:value+value', + ]); + }); + + it('should remove empty strings', () => { + expect( + processFingerpostAFPCategoryCodes(['iptccat:a', '', 'iptccat:c']), + ).toEqual(['afpCat:a', 'afpCat:c']); + }); + + it('should remove trailing and leading whitespace', () => { + expect( + processFingerpostAFPCategoryCodes([ + 'iptccat:a ', + ' iptccat:c', + ' service:news ', + 'iptccat: ', + 'qCode:value ', + ]), + ).toEqual(['afpCat:a', 'afpCat:c', 'qCode:value']); + }); + + it('should deduplicate category codes after stripping whitespace', () => { + expect( + processFingerpostAFPCategoryCodes([ + 'iptccat:ECO+SOC+ECO+SOC+ECO ', + ' iptccat:ECO', + ]), + ).toEqual(['afpCat:ECO', 'afpCat:SOC']); + }); +}); diff --git a/ingestion-lambda/src/categoryCodes.ts b/ingestion-lambda/src/categoryCodes.ts index aea2ff0..6511561 100644 --- a/ingestion-lambda/src/categoryCodes.ts +++ b/ingestion-lambda/src/categoryCodes.ts @@ -14,19 +14,21 @@ function partition( return [first, second]; } -/** - * We receive AP codes from Fingerpost in the format `prefix:code1+code2+code3:code4+code5`. - * At the time of writing these are AP category codes, but mislabelled as `iptccat` codes. - * This function transforms the prefix, and splits the codes into individual category codes. - */ -function flattenCategoryCodes(categoryCodes: string): string[] { - const [prefix, ...codes] = categoryCodes.split(':'); - return codes - .flatMap((_) => _.split('+')) - .map((code) => `${prefix?.trim() === 'iptccat' ? 'apCat' : prefix}:${code}`); -} - export function processFingerpostAPCategoryCodes(original: string[]): string[] { + /** + * We receive AP codes from Fingerpost in the format `prefix:code1+code2+code3:code4+code5`. + * At the time of writing these are AP category codes, but mislabelled as `iptccat` codes. + * This function transforms the prefix, and splits the codes into individual category codes. + */ + function flattenCategoryCodes(categoryCodes: string): string[] { + const [prefix, ...codes] = categoryCodes.split(':'); + return codes + .flatMap((_) => _.split('+')) + .map( + (code) => `${prefix?.trim() === 'iptccat' ? 'apCat' : prefix}:${code}`, + ); + } + const notServiceCodes = original.filter((_) => !_.includes('service:')); // we aren't interested in keeping the service codes here const [categoryCodes, rest] = partition(notServiceCodes, (code) => code.includes('iptccat:'), @@ -40,19 +42,49 @@ export function processFingerpostAPCategoryCodes(original: string[]): string[] { return deduped; } -export function processFingerpostAAPCategoryCodes(categoryCodes: string[]): string[] { - const allCategoryCodes = categoryCodes - .flatMap((categoryCode) => categoryCode.split('|')) +export function processFingerpostAAPCategoryCodes( + categoryCodes: string[], +): string[] { + const allCategoryCodes = categoryCodes.flatMap((categoryCode) => + categoryCode.split('|'), + ); - const mediaTopics = allCategoryCodes - .filter((_) => _.split(':').length > 1) + const mediaTopics = allCategoryCodes.filter((_) => _.split(':').length > 1); const legacySubjectCodes = allCategoryCodes .filter((_) => _.split('+').length > 1) .map((categoryCode) => { - const [ code, _label ] = categoryCode.split('+'); - return `subj:${code}` + const [code, _label] = categoryCode.split('+'); + return `subj:${code}`; }); - return [...mediaTopics, ...legacySubjectCodes] + return [...mediaTopics, ...legacySubjectCodes]; +} + +// example input: "iptccat:HUM+SCI" +export function processFingerpostAFPCategoryCodes( + original: string[], +): string[] { + function flattenCategoryCodes(categoryCodes: string): string[] { + const [prefix, ...codes] = categoryCodes.split(':'); + return codes + .flatMap((_) => _.split('+')) + .filter((_) => _.trim().length > 0) + .map( + (code) => `${prefix?.trim() === 'iptccat' ? 'afpCat' : prefix}:${code}`, + ); + } + + const notServiceCodes = original.filter((_) => !_.includes('service:')); + const [categoryCodes, rest] = partition(notServiceCodes, (code) => + code.includes('iptccat:'), + ); + + const transformedCategoryCodes = categoryCodes.flatMap(flattenCategoryCodes); + const allCategoryCodes = [...transformedCategoryCodes, ...rest] + .map((_) => _.trim()) + .filter((_) => _.length > 0); + const deduped = [...new Set(allCategoryCodes)]; + + return deduped; } diff --git a/ingestion-lambda/src/handler.ts b/ingestion-lambda/src/handler.ts index 734401a..d469bd6 100644 --- a/ingestion-lambda/src/handler.ts +++ b/ingestion-lambda/src/handler.ts @@ -8,7 +8,11 @@ import { createLogger } from '../../shared/lambda-logging'; import { createDbConnection } from '../../shared/rds'; import type { IngestorInputBody } from '../../shared/types'; import { IngestorInputBodySchema } from '../../shared/types'; -import {processFingerpostAAPCategoryCodes, processFingerpostAPCategoryCodes} from './categoryCodes'; +import { + processFingerpostAAPCategoryCodes, + processFingerpostAFPCategoryCodes, + processFingerpostAPCategoryCodes, +} from './categoryCodes'; import { tableName } from './database'; import { BUCKET_NAME, s3Client } from './s3'; import { lookupSupplier } from './suppliers'; @@ -52,20 +56,21 @@ export const processKeywords = ( return cleanAndDedupeKeywords(keywords.split('+')); }; -const processCategoryCodes = (supplier: string | undefined, subjectCodes: string[]) => { +const processCategoryCodes = ( + supplier: string | undefined, + subjectCodes: string[], +) => { switch (supplier) { case 'AP': - return processFingerpostAPCategoryCodes( - subjectCodes, - ); + return processFingerpostAPCategoryCodes(subjectCodes); case 'AAP': - return processFingerpostAAPCategoryCodes( - subjectCodes, - ); + return processFingerpostAAPCategoryCodes(subjectCodes); + case 'AFP': + return processFingerpostAFPCategoryCodes(subjectCodes); default: return []; } -} +}; const safeBodyParse = (body: string): IngestorInputBody => { try { @@ -154,7 +159,10 @@ export const main = async (event: SQSEvent): Promise => { const supplier = lookupSupplier(snsMessageContent['source-feed']); - const categoryCodes = processCategoryCodes(supplier, snsMessageContent.subjects?.code ?? []) + const categoryCodes = processCategoryCodes( + supplier, + snsMessageContent.subjects?.code ?? [], + ); const result = await sql` INSERT INTO ${sql(tableName)} From 0be015f19188271ff3b809869cac4a2a1aed6a8c Mon Sep 17 00:00:00 2001 From: Pete F Date: Thu, 27 Feb 2025 12:59:30 +0000 Subject: [PATCH 2/2] Fix: filter out empty catcodes for AP --- ingestion-lambda/src/categoryCodes.test.ts | 7 +++++++ ingestion-lambda/src/categoryCodes.ts | 1 + 2 files changed, 8 insertions(+) diff --git a/ingestion-lambda/src/categoryCodes.test.ts b/ingestion-lambda/src/categoryCodes.test.ts index 613f3fa..e5766c9 100644 --- a/ingestion-lambda/src/categoryCodes.test.ts +++ b/ingestion-lambda/src/categoryCodes.test.ts @@ -13,6 +13,12 @@ describe('processFingerpostAPCategoryCodes', () => { expect(processFingerpostAPCategoryCodes(['service:news'])).toEqual([]); }); + it('should strip out empty iptccat entries', () => { + expect(processFingerpostAPCategoryCodes(['iptccat:', 'iptccat:a'])).toEqual( + ['apCat:a'], + ); + }); + it('should return simple codes labelled "iptccat" as simple "apCat" codes', () => { expect( processFingerpostAPCategoryCodes(['iptccat:a', 'iptccat:b']), @@ -45,6 +51,7 @@ describe('processFingerpostAPCategoryCodes', () => { ' iptccat:c', ' service:news ', 'qCode:value ', + 'iptccat: ', ]), ).toEqual(['apCat:a', 'apCat:c', 'qCode:value']); }); diff --git a/ingestion-lambda/src/categoryCodes.ts b/ingestion-lambda/src/categoryCodes.ts index 6511561..9d9f216 100644 --- a/ingestion-lambda/src/categoryCodes.ts +++ b/ingestion-lambda/src/categoryCodes.ts @@ -24,6 +24,7 @@ export function processFingerpostAPCategoryCodes(original: string[]): string[] { const [prefix, ...codes] = categoryCodes.split(':'); return codes .flatMap((_) => _.split('+')) + .filter((_) => _.trim().length > 0) .map( (code) => `${prefix?.trim() === 'iptccat' ? 'apCat' : prefix}:${code}`, );