Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process AFP category codes on ingestion #180

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 70 additions & 2 deletions ingestion-lambda/src/categoryCodes.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import {processFingerpostAAPCategoryCodes, processFingerpostAPCategoryCodes} from './categoryCodes';
import {
processFingerpostAAPCategoryCodes,
processFingerpostAFPCategoryCodes,
processFingerpostAPCategoryCodes,
} from './categoryCodes';

describe('processFingerpostAPCategoryCodes', () => {
it('should return an empty array if provided with an empty array', () => {
Expand All @@ -9,6 +13,12 @@ describe('processFingerpostAPCategoryCodes', () => {
expect(processFingerpostAPCategoryCodes(['service:news'])).toEqual([]);
});

it('should strip out empty iptccat entries', () => {
expect(processFingerpostAPCategoryCodes(['iptccat:', 'iptccat:a'])).toEqual(
['apCat:a'],
);
});

it('should return simple codes labelled "iptccat" as simple "apCat" codes', () => {
expect(
processFingerpostAPCategoryCodes(['iptccat:a', 'iptccat:b']),
Expand Down Expand Up @@ -41,6 +51,7 @@ describe('processFingerpostAPCategoryCodes', () => {
' iptccat:c',
' service:news ',
'qCode:value ',
'iptccat: ',
]),
).toEqual(['apCat:a', 'apCat:c', 'qCode:value']);
});
Expand Down Expand Up @@ -71,7 +82,7 @@ describe('processFingerpostAAPCategoryCodes', () => {
'04007003+food',
'goods|04013002+food',
'and',
'medtop:20000049'
'medtop:20000049',
]),
).toEqual(['medtop:20000049', 'subj:04007003', 'subj:04013002']);
});
Expand All @@ -90,3 +101,60 @@ describe('processFingerpostAAPCategoryCodes', () => {
]);
});
});

/**
 * Behavioural spec for AFP category-code processing.
 *
 * AFP entries arrive labelled `iptccat:` and are expected to come back as
 * `afpCat:` codes: `+`-joined codes are split apart, `service:` entries and
 * empty values are dropped, whitespace is trimmed, and duplicates are removed.
 * Anything else is passed through unchanged.
 */
describe('processFingerpostAFPCategoryCodes', () => {
  it('should return an empty array if provided with an empty array', () => {
    expect(processFingerpostAFPCategoryCodes([])).toEqual([]);
  });

  it('should strip out service codes', () => {
    expect(processFingerpostAFPCategoryCodes(['service:news'])).toEqual([]);
  });

  it('should strip out empty iptccat entries', () => {
    expect(
      processFingerpostAFPCategoryCodes(['iptccat:', 'iptccat:a']),
    ).toEqual(['afpCat:a']);
  });

  // Title names the real input/output prefixes, matching the equivalent AP test.
  it('should return simple codes labelled "iptccat" as simple "afpCat" codes', () => {
    expect(
      processFingerpostAFPCategoryCodes(['iptccat:a', 'iptccat:b']),
    ).toEqual(['afpCat:a', 'afpCat:b']);
  });

  it('should expand category codes with multiple subcodes', () => {
    expect(processFingerpostAFPCategoryCodes(['iptccat:c+d'])).toEqual([
      'afpCat:c',
      'afpCat:d',
    ]);
  });

  it('should pass other codes through untransformed', () => {
    expect(processFingerpostAFPCategoryCodes(['qCode:value+value'])).toEqual([
      'qCode:value+value',
    ]);
  });

  it('should remove empty strings', () => {
    expect(
      processFingerpostAFPCategoryCodes(['iptccat:a', '', 'iptccat:c']),
    ).toEqual(['afpCat:a', 'afpCat:c']);
  });

  it('should remove trailing and leading whitespace', () => {
    expect(
      processFingerpostAFPCategoryCodes([
        'iptccat:a ',
        ' iptccat:c',
        ' service:news ',
        'iptccat: ',
        'qCode:value ',
      ]),
    ).toEqual(['afpCat:a', 'afpCat:c', 'qCode:value']);
  });

  it('should deduplicate category codes after stripping whitespace', () => {
    expect(
      processFingerpostAFPCategoryCodes([
        'iptccat:ECO+SOC+ECO+SOC+ECO ',
        ' iptccat:ECO',
      ]),
    ).toEqual(['afpCat:ECO', 'afpCat:SOC']);
  });
});
73 changes: 53 additions & 20 deletions ingestion-lambda/src/categoryCodes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,22 @@ function partition<T>(
return [first, second];
}

/**
 * Expands one Fingerpost category-code entry into individual `prefix:code` strings.
 *
 * We receive AP codes from Fingerpost in the format `prefix:code1+code2+code3:code4+code5`.
 * At the time of writing these are AP category codes, but mislabelled as `iptccat` codes,
 * so an `iptccat` prefix (ignoring surrounding whitespace) is rewritten to `apCat`.
 * Any other prefix is kept exactly as received.
 *
 * Each code is trimmed, and codes that are empty after trimming are dropped, so an
 * entry such as `iptccat:` or `iptccat:a+` never yields a dangling `apCat:` value.
 *
 * @param categoryCodes - a single raw entry, e.g. `iptccat:a+b`
 * @returns one `prefix:code` string per non-empty code; empty if the entry has no codes
 */
function flattenCategoryCodes(categoryCodes: string): string[] {
  const [prefix, ...codes] = categoryCodes.split(':');
  const outputPrefix = prefix?.trim() === 'iptccat' ? 'apCat' : prefix;

  return codes
    .flatMap((segment) => segment.split('+'))
    .map((code) => code.trim())
    // Without this filter, `iptccat:` would be emitted as the meaningless code `apCat:`.
    .filter((code) => code.length > 0)
    .map((code) => `${outputPrefix}:${code}`);
}

export function processFingerpostAPCategoryCodes(original: string[]): string[] {
/**
 * Breaks a single Fingerpost entry of the shape `prefix:code1+code2:code3` into
 * one `prefix:code` string per code.
 * The feed currently labels AP category codes as `iptccat`, so that prefix
 * (compared after trimming) is replaced with `apCat`; other prefixes are left alone.
 * Codes that are empty or whitespace-only are skipped.
 */
function flattenCategoryCodes(categoryCodes: string): string[] {
  const segments = categoryCodes.split(':');
  const rawPrefix = segments[0];
  const label = rawPrefix?.trim() === 'iptccat' ? 'apCat' : rawPrefix;

  const flattened: string[] = [];
  for (const segment of segments.slice(1)) {
    for (const code of segment.split('+')) {
      // Skip blanks so that `iptccat:` does not produce a bare `apCat:`.
      if (code.trim().length === 0) {
        continue;
      }
      flattened.push(`${label}:${code}`);
    }
  }
  return flattened;
}

const notServiceCodes = original.filter((_) => !_.includes('service:')); // we aren't interested in keeping the service codes here
const [categoryCodes, rest] = partition(notServiceCodes, (code) =>
code.includes('iptccat:'),
Expand All @@ -40,19 +43,49 @@ export function processFingerpostAPCategoryCodes(original: string[]): string[] {
return deduped;
}

/**
 * Normalises AAP category codes.
 *
 * Every entry is first split on `|`. From the resulting pieces:
 *  - pieces containing a `:` are returned unchanged (the "media topics");
 *  - pieces containing a `+` are reduced to the text before the first `+`
 *    and returned as `subj:<code>` (the "legacy subject codes").
 * Pieces with neither separator are dropped. Media topics come first in the result.
 *
 * NOTE(review): this span is a rendered diff, so each statement appears twice
 * (before and after reformatting); both forms compute the same thing.
 */
export function processFingerpostAAPCategoryCodes(categoryCodes: string[]): string[] {
const allCategoryCodes = categoryCodes
.flatMap((categoryCode) => categoryCode.split('|'))
export function processFingerpostAAPCategoryCodes(
categoryCodes: string[],
): string[] {
const allCategoryCodes = categoryCodes.flatMap((categoryCode) =>
categoryCode.split('|'),
);

const mediaTopics = allCategoryCodes
.filter((_) => _.split(':').length > 1)
const mediaTopics = allCategoryCodes.filter((_) => _.split(':').length > 1);

const legacySubjectCodes = allCategoryCodes
.filter((_) => _.split('+').length > 1)
.map((categoryCode) => {
// Only the part before the first `+` is kept; the label after it is discarded.
const [ code, _label ] = categoryCode.split('+');
return `subj:${code}`
const [code, _label] = categoryCode.split('+');
return `subj:${code}`;
});

return [...mediaTopics, ...legacySubjectCodes]
return [...mediaTopics, ...legacySubjectCodes];
}

// example input: "iptccat:HUM+SCI"
export function processFingerpostAFPCategoryCodes(
original: string[],
): string[] {
function flattenCategoryCodes(categoryCodes: string): string[] {
const [prefix, ...codes] = categoryCodes.split(':');
return codes
.flatMap((_) => _.split('+'))
.filter((_) => _.trim().length > 0)
Comment on lines +71 to +73
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part is duplicated in `processFingerpostAPCategoryCodes`; it might be good to move it into a shared function:

codes
    .flatMap((_) => _.split('+'))
    .filter((_) => _.trim().length > 0)

.map(
(code) => `${prefix?.trim() === 'iptccat' ? 'afpCat' : prefix}:${code}`,
);
}

const notServiceCodes = original.filter((_) => !_.includes('service:'));
const [categoryCodes, rest] = partition(notServiceCodes, (code) =>
code.includes('iptccat:'),
);

const transformedCategoryCodes = categoryCodes.flatMap(flattenCategoryCodes);
const allCategoryCodes = [...transformedCategoryCodes, ...rest]
.map((_) => _.trim())
.filter((_) => _.length > 0);
const deduped = [...new Set(allCategoryCodes)];

return deduped;
}
28 changes: 18 additions & 10 deletions ingestion-lambda/src/handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ import { createLogger } from '../../shared/lambda-logging';
import { createDbConnection } from '../../shared/rds';
import type { IngestorInputBody } from '../../shared/types';
import { IngestorInputBodySchema } from '../../shared/types';
import {processFingerpostAAPCategoryCodes, processFingerpostAPCategoryCodes} from './categoryCodes';
import {
processFingerpostAAPCategoryCodes,
processFingerpostAFPCategoryCodes,
processFingerpostAPCategoryCodes,
} from './categoryCodes';
import { tableName } from './database';
import { BUCKET_NAME, s3Client } from './s3';
import { lookupSupplier } from './suppliers';
Expand Down Expand Up @@ -52,20 +56,21 @@ export const processKeywords = (
return cleanAndDedupeKeywords(keywords.split('+'));
};

/**
 * Routes raw subject codes to the processor for the given supplier.
 *
 * `'AP'`, `'AAP'` and `'AFP'` each have a dedicated processor; any other
 * supplier (including `undefined`) yields an empty array.
 *
 * NOTE(review): this span is a rendered diff, so some lines appear in both
 * their pre- and post-change form.
 */
const processCategoryCodes = (supplier: string | undefined, subjectCodes: string[]) => {
const processCategoryCodes = (
supplier: string | undefined,
subjectCodes: string[],
) => {
switch (supplier) {
case 'AP':
return processFingerpostAPCategoryCodes(
subjectCodes,
);
return processFingerpostAPCategoryCodes(subjectCodes);
case 'AAP':
return processFingerpostAAPCategoryCodes(
subjectCodes,
);
return processFingerpostAAPCategoryCodes(subjectCodes);
case 'AFP':
return processFingerpostAFPCategoryCodes(subjectCodes);
default:
// No category-code handling for this supplier.
return [];
}
}
};

const safeBodyParse = (body: string): IngestorInputBody => {
try {
Expand Down Expand Up @@ -154,7 +159,10 @@ export const main = async (event: SQSEvent): Promise<SQSBatchResponse> => {

const supplier = lookupSupplier(snsMessageContent['source-feed']);

const categoryCodes = processCategoryCodes(supplier, snsMessageContent.subjects?.code ?? [])
const categoryCodes = processCategoryCodes(
supplier,
snsMessageContent.subjects?.code ?? [],
);

const result = await sql`
INSERT INTO ${sql(tableName)}
Expand Down
Loading