IO-3515: resolve issues with search selects not updating; improve confidence scoring.

This commit is contained in:
Patrick Fic
2026-02-19 12:22:35 -08:00
parent 5d53d09af9
commit ae1408012f
11 changed files with 410 additions and 26552 deletions

View File

@@ -2,6 +2,8 @@
const Fuse = require('fuse.js');
const { has } = require("lodash");
const { standardizedFieldsnames } = require('./bill-ocr-normalize');
const InstanceManager = require("../../utils/instanceMgr").default;
const PRICE_PERCENT_MARGIN_TOLERANCE = 0.5; // Tolerance (±50%) used when checking whether extracted prices and costs are plausible.
@@ -13,11 +15,97 @@ const normalizePartNumber = (str) => {
/**
 * Normalize free text for fuzzy matching: strip everything except letters,
 * digits, and whitespace, collapse runs of whitespace to a single space,
 * trim, and uppercase.
 * @param {string} str - Raw text (e.g. a jobline description).
 * @returns {string} Normalized text; empty string for non-string input.
 */
const normalizeText = (str) => {
    // Guard non-string input (consistent with normalizePrice) instead of throwing.
    if (typeof str !== 'string') return '';
    return str.replace(/[^a-zA-Z0-9\s]/g, '').replace(/\s+/g, ' ').trim().toUpperCase();
};
/**
 * Strip a price string down to digits, dots, and minus signs
 * (e.g. "$1,234.56" -> "1234.56"). Non-string input passes through unchanged.
 * @param {*} value - Raw price value.
 * @returns {*} Cleaned numeric string, or the original value if not a string.
 */
const normalizePrice = (value) => {
    if (typeof value === 'string') {
        return value.replace(/[^0-9.-]+/g, "");
    }
    return value;
};
/**
 * Parse a (possibly OCR-garbled) price string into a single number.
 * Handles duplicated values ("47.57.47.57"), delimiter-separated values
 * ("10.00/20.00"), currency symbols, and plain numbers.
 * @param {*} str - Raw price value from OCR; may already be numeric.
 * @returns {number} Best-guess price, or 0 when nothing numeric is found.
 */
const normalizePriceFinal = (str) => {
    // Pick the most plausible single price from candidates:
    // identical values collapse to one, values within 1% of their mean
    // average out, otherwise the first (most likely correct) value wins.
    const pickLikelyPrice = (numbers) => {
        const uniqueNumbers = [...new Set(numbers)];
        if (uniqueNumbers.length === 1) {
            return uniqueNumbers[0];
        }
        const avg = numbers.reduce((a, b) => a + b, 0) / numbers.length;
        const allClose = numbers.every(num => Math.abs(num - avg) / avg < 0.01);
        return allClose ? avg : numbers[0];
    };
    if (typeof str !== 'string') {
        // Non-string input (e.g. already a number): coerce, defaulting to 0.
        const num = parseFloat(str);
        return isNaN(num) ? 0 : num;
    }
    // First, try to extract valid decimal number patterns (e.g., "123.45").
    const decimalMatches = str.match(/\d+\.\d{1,2}/g);
    if (decimalMatches && decimalMatches.length > 0) {
        const numbers = decimalMatches.map(m => parseFloat(m)).filter(n => !isNaN(n) && n > 0);
        if (numbers.length > 0) {
            return pickLikelyPrice(numbers);
        }
        // All decimal matches filtered out (e.g. "0.00") — fall through to fallbacks.
    }
    // Fallback: split on common delimiters and extract all potential numbers.
    const parts = str.split(/[\/|\\,;]/).map(part => part.trim()).filter(part => part.length > 0);
    if (parts.length > 1) {
        // Multiple values detected - extract and parse all valid numbers.
        const numbers = parts
            .map(part => {
                const cleaned = part.replace(/[^0-9.-]+/g, "");
                const parsed = parseFloat(cleaned);
                return isNaN(parsed) ? null : parsed;
            })
            .filter(num => num !== null && num > 0);
        if (numbers.length === 0) {
            // No valid numbers found in any part; fall back to basic cleaning.
            const cleaned = str.replace(/[^0-9.-]+/g, "");
            const parsed = parseFloat(cleaned);
            return isNaN(parsed) ? 0 : parsed;
        }
        return pickLikelyPrice(numbers);
    }
    // Single value or no delimiters, clean normally.
    const cleaned = str.replace(/[^0-9.-]+/g, "");
    const parsed = parseFloat(cleaned);
    return isNaN(parsed) ? 0 : parsed;
};
// Helper function to calculate Textract OCR confidence (0-100%)
const calculateTextractConfidence = (textractLineItem) => {
@@ -38,6 +126,11 @@ const calculateTextractConfidence = (textractLineItem) => {
return 0;
}
// Check if critical normalized labels are present
const hasActualCost = Object.values(textractLineItem).some(field => field.normalizedLabel === standardizedFieldsnames.actual_cost);
const hasActualPrice = Object.values(textractLineItem).some(field => field.normalizedLabel === standardizedFieldsnames.actual_price);
const hasLineDesc = Object.values(textractLineItem).some(field => field.normalizedLabel === standardizedFieldsnames.line_desc);
// Calculate weighted average, giving more weight to important fields
// If we can identify key fields (ITEM, PRODUCT_CODE, PRICE), weight them higher
let totalWeight = 0;
@@ -47,18 +140,42 @@ const calculateTextractConfidence = (textractLineItem) => {
if (field.confidence && typeof field.confidence === 'number') {
// Weight important fields higher
let weight = 1;
if (key === 'ITEM' || key === 'PRODUCT_CODE') {
weight = 2; // Description and part number are most important
} else if (key === 'PRICE' || key === 'UNIT_PRICE' || key === 'QUANTITY') {
weight = 1.5; // Price and quantity moderately important
if (field.normalizedLabel === standardizedFieldsnames.actual_cost || field.normalizedLabel === standardizedFieldsnames.actual_price) {
weight = 4;
}
else if (field.normalizedLabel === standardizedFieldsnames.part_no || field.normalizedLabel === standardizedFieldsnames.line_desc) {
weight = 3.5;
}
else if (field.normalizedLabel === standardizedFieldsnames.quantity) {
weight = 3.5;
}
// else if (key === 'ITEM' || key === 'PRODUCT_CODE') {
// weight = 3; // Description and part number are most important
// } else if (key === 'PRICE' || key === 'UNIT_PRICE' || key === 'QUANTITY') {
// weight = 2; // Price and quantity moderately important
// }
weightedSum += field.confidence * weight;
totalWeight += weight;
}
});
const avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0;
let avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0;
// Apply penalty if critical normalized labels are missing
let missingFieldsPenalty = 1.0;
let missingCount = 0;
if (!hasActualCost) missingCount++;
if (!hasActualPrice) missingCount++;
if (!hasLineDesc) missingCount++;
// Each missing field reduces confidence by 15%
if (missingCount > 0) {
missingFieldsPenalty = 1.0 - (missingCount * 0.15);
}
avgConfidence = avgConfidence * missingFieldsPenalty;
return Math.round(avgConfidence * 100) / 100; // Round to 2 decimal places
};
@@ -109,9 +226,9 @@ const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
// Overall confidence is affected by both how well Textract read the data
// and how well we matched it to existing joblines
// Use a weighted average: 40% OCR confidence, 60% match confidence
// Match confidence is more important because even perfect OCR is useless without a good match
const overall = (ocrConfidence * 0.4) + (matchConfidence * 0.6);
// Use a weighted average: 60% OCR confidence, 40% match confidence
// OCR confidence is more important because even perfect match is useless without good OCR
const overall = (ocrConfidence * 0.6) + (matchConfidence * 0.4);
return Math.round(overall * 100) / 100;
};
@@ -147,61 +264,63 @@ const mergeResults = (resultsArray, weights = []) => {
.slice(0, 5); // Return top 5 results
};
async function generateBillFormData({ processedData, jobid, bodyshopid, partsorderid, req }) {
async function generateBillFormData({ processedData, jobid: jobidFromProps, bodyshopid, partsorderid, req }) {
const client = req.userGraphQLClient;
//TODO: Add in vendor data.
let jobid = jobidFromProps;
//If no jobid, fetch it, and funnel it back.
if (!jobid || jobid === null || jobid === undefined || jobid === "" || jobid === "null" || jobid === "undefined") {
const ro_number = processedData.summary?.PO_NUMBER?.value || Object.values(processedData.summary).find(value => value.normalizedLabel === 'ro_number')?.value;
if (!ro_number) {
throw new Error("Could not find RO number in the extracted data to associate with the bill. Select an RO and try again.");
}
const { jobs } = await client.request(`
query QUERY_BILL_OCR_JOB_BY_RO($ro_number: String!) {
jobs(where: {ro_number: {_eq: $ro_number}}) {
id
}
}`, { ro_number });
if (jobs.length === 0) {
throw new Error("No job found for the detected RO/PO number.");
} else {
jobid = jobs[0].id;
}
}
const jobData = await client.request(`
query QUERY_BILL_OCR_DATA($jobid: uuid!, $partsorderid: uuid!) {
vendors{
query QUERY_BILL_OCR_DATA($jobid: uuid!) {
vendors {
id
name
}
jobs_by_pk(id: $jobid) {
id
bodyshop {
id
md_responsibility_centers
cdk_dealerid
pbs_serialnumber
rr_dealerid
}
jobs_by_pk(id: $jobid) {
id
bodyshop{
id
md_responsibility_centers
cdk_dealerid
pbs_serialnumber
rr_dealerid
}
joblines {
id
line_desc
removed
act_price
db_price
oem_partno
alt_partno
part_type
}
}
parts_orders_by_pk(id: $partsorderid) {
id
parts_order_lines {
id
line_desc
act_price
cost
jobline {
id
line_desc
act_price
oem_partno
alt_partno
part_type
}
}
}
joblines {
id
line_desc
removed
act_price
db_price
oem_partno
alt_partno
part_type
}
}
`, {
jobid, partsorderid // this may fail if null?
}
`, {
jobid, // TODO: Refactor back in parts orders
});
//TODO: Need to find a vendor ID. Create a fuse for it, and fuzzy search for it using the textract vendor info.
//Create fuses of line descriptions for matching.
const jobLineDescFuse = new Fuse(
jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
@@ -226,7 +345,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
},
{
name: 'oem_partno_normalized',
weight: 5
weight: 6
},
{
name: 'alt_partno_normalized',
@@ -238,7 +357,6 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
}
);
const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
const vendorFuse = new Fuse(
jobData.vendors,
@@ -250,13 +368,13 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
}
);
const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
const vendorMatches = vendorFuse.search(processedData.summary?.VENDOR_NAME?.value || processedData.summary?.NAME?.value);
let vendorid;
if (vendorMatches.length > 0) {
vendorid = vendorMatches[0].item.id;
}
const { jobs_by_pk: job, parts_orders_by_pk: partsOrder } = jobData;
const { jobs_by_pk: job } = jobData;
if (!job) {
throw new Error('Job not found for bill form data generation.');
}
@@ -344,9 +462,9 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
: null
: responsibilityCenters.defaults &&
(responsibilityCenters.defaults.costs[matchToUse?.item?.part_type] || null)
: null, //Needs to get set by client side.
"applicable_taxes": { //Not sure what to do with these?
"federal": false,
: null,
"applicable_taxes": {
"federal": InstanceManager({ imex: true, rome: false }),
"state": false,
"local": false
},
@@ -551,43 +669,43 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
})
// Output search statistics table
console.log('\n═══════════════════════════════════════════════════════════════════════');
console.log(' FUSE.JS SEARCH STATISTICS');
console.log('═══════════════════════════════════════════════════════════════════════\n');
// // Output search statistics table
// console.log('\n═══════════════════════════════════════════════════════════════════════');
// console.log(' FUSE.JS SEARCH STATISTICS');
// console.log('═══════════════════════════════════════════════════════════════════════\n');
searchStats.forEach(lineStat => {
console.log(`📄 Line Item #${lineStat.lineNumber}:`);
console.log('─'.repeat(75));
// searchStats.forEach(lineStat => {
// console.log(`📄 Line Item #${lineStat.lineNumber}:`);
// console.log('─'.repeat(75));
if (lineStat.searches.length > 0) {
const tableData = lineStat.searches.map(search => ({
'Search Type': search.type,
'Search Term': search.term.substring(0, 40) + (search.term.length > 40 ? '...' : ''),
'Results': search.results
}));
console.table(tableData);
} else {
console.log(' No searches performed for this line item.\n');
}
});
// if (lineStat.searches.length > 0) {
// const tableData = lineStat.searches.map(search => ({
// 'Search Type': search.type,
// 'Search Term': search.term.substring(0, 40) + (search.term.length > 40 ? '...' : ''),
// 'Results': search.results
// }));
// console.table(tableData);
// } else {
// console.log(' No searches performed for this line item.\n');
// }
// });
// Summary statistics
const totalSearches = searchStats.reduce((sum, stat) => sum + stat.searches.length, 0);
const totalResults = searchStats.reduce((sum, stat) =>
sum + stat.searches.reduce((s, search) => s + search.results, 0), 0);
const avgResultsPerSearch = totalSearches > 0 ? (totalResults / totalSearches).toFixed(2) : 0;
// // Summary statistics
// const totalSearches = searchStats.reduce((sum, stat) => sum + stat.searches.length, 0);
// const totalResults = searchStats.reduce((sum, stat) =>
// sum + stat.searches.reduce((s, search) => s + search.results, 0), 0);
// const avgResultsPerSearch = totalSearches > 0 ? (totalResults / totalSearches).toFixed(2) : 0;
console.log('═══════════════════════════════════════════════════════════════════════');
console.log(' SUMMARY');
console.log('═══════════════════════════════════════════════════════════════════════');
console.table({
'Total Line Items': processedData.lineItems.length,
'Total Searches Performed': totalSearches,
'Total Results Found': totalResults,
'Average Results per Search': avgResultsPerSearch
});
console.log('═══════════════════════════════════════════════════════════════════════\n');
// console.log('═══════════════════════════════════════════════════════════════════════');
// console.log(' SUMMARY');
// console.log('═══════════════════════════════════════════════════════════════════════');
// console.table({
// 'Total Line Items': processedData.lineItems.length,
// 'Total Searches Performed': totalSearches,
// 'Total Results Found': totalResults,
// 'Average Results per Search': avgResultsPerSearch
// });
// console.log('═══════════════════════════════════════════════════════════════════════\n');
return matches
}

View File

@@ -1,7 +1,6 @@
const PDFDocument = require('pdf-lib').PDFDocument;
const TEXTRACT_REDIS_PREFIX = "textract:"
const TEXTRACT_JOB_TTL = 3600;
const TEXTRACT_REDIS_PREFIX = `textract:${process.env?.NODE_ENV === "production" ? "PROD" : "TEST"}`
const TEXTRACT_JOB_TTL = 10 * 60;
/**

View File

@@ -6,6 +6,14 @@ function normalizeFieldName(fieldType) {
return fieldType;
}
// Canonical normalized field names for bill OCR line items. normalizeLabelName's
// labelMap maps raw Textract labels (e.g. 'sale_qty', 'net_cost', 'po_no') onto
// these values, and downstream confidence scoring checks for their presence.
const standardizedFieldsnames = {
actual_cost: "actual_cost",
actual_price: "actual_price",
line_desc: "line_desc",
quantity: "quantity",
part_no: "part_no",
ro_number: "ro_number", // maps from PO-number labels; treated as the RO identifier
}
function normalizeLabelName(labelText) {
if (!labelText) return '';
@@ -15,13 +23,7 @@ function normalizeLabelName(labelText) {
// Remove special characters and replace spaces with underscores
normalized = normalized.replace(/[^a-z0-9\s]/g, '').replace(/\s+/g, '_');
const standardizedFieldsnames = {
actual_cost: "actual_cost",
actual_price: "actual_price",
line_desc: "line_desc",
quantity: "quantity",
part_no: "part_no"
}
// Common label normalizations
const labelMap = {
@@ -30,6 +32,9 @@ function normalizeLabelName(labelText) {
'sale_qty': standardizedFieldsnames.quantity,
'invoiced_qty': standardizedFieldsnames.quantity,
'qty_shipped': standardizedFieldsnames.quantity,
'quantity': standardizedFieldsnames.quantity,
'filled': standardizedFieldsnames.quantity,
'count': standardizedFieldsnames.quantity,
'quant': standardizedFieldsnames.quantity,
'desc': standardizedFieldsnames.line_desc,
'description': standardizedFieldsnames.line_desc,
@@ -48,7 +53,10 @@ function normalizeLabelName(labelText) {
'net': standardizedFieldsnames.actual_cost,
'selling_price': standardizedFieldsnames.actual_cost,
'net_price': standardizedFieldsnames.actual_cost,
'net_cost': standardizedFieldsnames.actual_cost
'net_cost': standardizedFieldsnames.actual_cost,
'po_no': standardizedFieldsnames.ro_number,
'customer_po_no': standardizedFieldsnames.ro_number,
'customer_po_no_': standardizedFieldsnames.ro_number
};
@@ -102,10 +110,6 @@ function processScanData(invoiceData) {
return processedItem;
})
// .filter(item => {
// // Filter out items with no description or with quantity <= 0
// return item.description && (!item.quantity || item.quantity > 0);
// });
return processed;
}
@@ -162,7 +166,7 @@ function extractInvoiceData(textractResponse) {
let normalizedField = normalizeFieldName(fieldType);
// Ensure uniqueness by appending a counter if the field already exists
if (item.hasOwnProperty(normalizedField)) {
if (Object.prototype.hasOwnProperty.call(item, normalizedField)) {
fieldNameCounts[normalizedField] = (fieldNameCounts[normalizedField] || 1) + 1;
normalizedField = `${normalizedField}_${fieldNameCounts[normalizedField]}`;
}
@@ -191,5 +195,6 @@ function extractInvoiceData(textractResponse) {
module.exports = {
extractInvoiceData,
processScanData
processScanData,
standardizedFieldsnames
}

View File

@@ -6,4 +6,5 @@ Required Infrastructure setup
TODO:
* Create a rome bucket for uploads, or move to the regular spot.
* How to implement this across environments.
* How to prevent polling for a job that may have errored.
* How to prevent polling for a job that may have errored.
* Handling of HEIC files on upload.

View File

@@ -62,27 +62,12 @@ async function handleBillOcr(request, response) {
// The uploaded file is available in request.file
const uploadedFile = request.file;
const { jobid, bodyshopid, partsorderid, skipTextract } = request.body;
if (skipTextract === 'true') {
console.log('Skipping Textract processing as per request');
response.status(200).send({
success: true,
status: 'COMPLETED',
data: await generateBillFormData({ processedData: null, jobid, bodyshopid, partsorderid, req: request }), //This is broken if the processedData is not overwritten in the function for testing.
message: 'Invoice processing completed'
});
return;
}
const { jobid, bodyshopid, partsorderid } = request.body;
try {
const fileType = getFileType(uploadedFile);
console.log(`Processing file type: ${fileType}`);
// Images are always processed synchronously (single page)
if (fileType === 'image') {
console.log('Image => 1 page, processing synchronously');
const processedData = await processSinglePageDocument(uploadedFile.buffer);
const billForm = await generateBillFormData({ processedData: processedData, jobid, bodyshopid, partsorderid, req: request });
response.status(200).send({
@@ -94,11 +79,9 @@ async function handleBillOcr(request, response) {
} else if (fileType === 'pdf') {
// Check the number of pages in the PDF
const pageCount = await getPdfPageCount(uploadedFile.buffer);
console.log(`PDF has ${pageCount} page(s)`);
if (pageCount === 1) {
// Process synchronously for single-page documents
console.log('PDF => 1 page, processing synchronously');
const processedData = await processSinglePageDocument(uploadedFile.buffer);
const billForm = await generateBillFormData({ processedData: processedData, jobid, bodyshopid, partsorderid, req: request });
//const billResult = await generateBillFormData({ result, });
@@ -110,12 +93,11 @@ async function handleBillOcr(request, response) {
});
} else {
// Start the Textract job (non-blocking) for multi-page documents
console.log('PDF => 2+ pages, processing asynchronously');
const jobInfo = await startTextractJob(uploadedFile.buffer, { jobid, bodyshopid, partsorderid });
response.status(202).send({
success: true,
jobId: jobInfo.jobId,
textractJobId: jobInfo.jobId,
message: 'Invoice processing started',
statusUrl: `/ai/bill-ocr/status/${jobInfo.jobId}`
});
@@ -136,17 +118,14 @@ async function handleBillOcr(request, response) {
}
async function handleBillOcrStatus(request, response) {
const { jobId: textractJobId } = request.params;
const { textractJobId } = request.params;
if (!textractJobId) {
console.log('No textractJobId found in params');
response.status(400).send({ error: 'Job ID is required' });
return;
}
console.log('Looking for job:', textractJobId);
const jobStatus = await getTextractJob({ redisPubClient, textractJobId });
console.log('Job status:', jobStatus);
if (!jobStatus) {
response.status(404).send({ error: 'Job not found' });
@@ -156,18 +135,17 @@ async function handleBillOcrStatus(request, response) {
if (jobStatus.status === 'COMPLETED') {
// Generate billForm on-demand if not already generated
let billForm = jobStatus.data?.billForm;
if (!billForm && jobStatus.context) {
try {
console.log('Generating bill form data on-demand...');
billForm = await generateBillFormData({
processedData: jobStatus.data,
billForm = await generateBillFormData({
processedData: jobStatus.data,
jobid: jobStatus.context.jobid,
bodyshopid: jobStatus.context.bodyshopid,
partsorderid: jobStatus.context.partsorderid,
req: request // Now we have request context!
});
// Cache the billForm back to Redis for future requests
await setTextractJob({
redisPubClient,
@@ -181,7 +159,6 @@ async function handleBillOcrStatus(request, response) {
}
});
} catch (error) {
console.error('Error generating bill form data:', error);
response.status(500).send({
status: 'COMPLETED',
error: 'Data processed but failed to generate bill form',
@@ -191,7 +168,7 @@ async function handleBillOcrStatus(request, response) {
return;
}
}
response.status(200).send({
status: 'COMPLETED',
data: {
@@ -211,9 +188,6 @@ async function handleBillOcrStatus(request, response) {
}
}
/**
* Process a single-page document synchronously using AnalyzeExpenseCommand
* @param {Buffer} pdfBuffer
@@ -238,6 +212,7 @@ async function processSinglePageDocument(pdfBuffer) {
async function startTextractJob(pdfBuffer, context = {}) {
// Upload PDF to S3 temporarily for Textract async processing
const { bodyshopid, jobid } = context;
const s3Bucket = process.env.AWS_AI_BUCKET;
const snsTopicArn = process.env.AWS_TEXTRACT_SNS_TOPIC_ARN;
const snsRoleArn = process.env.AWS_TEXTRACT_SNS_ROLE_ARN;
@@ -253,7 +228,7 @@ async function startTextractJob(pdfBuffer, context = {}) {
}
const uploadId = uuidv4();
const s3Key = `textract-temp/${uploadId}.pdf`; //TODO Update Keys structure to something better.
const s3Key = `textract-temp/${bodyshopid}/${jobid}/${uploadId}.pdf`; //TODO Update Keys structure to something better.
// Upload to S3
const uploadCommand = new PutObjectCommand({
@@ -319,7 +294,6 @@ async function processSQSMessages() {
}
try {
console.log('Polling SQS queue:', queueUrl);
const receiveCommand = new ReceiveMessageCommand({
QueueUrl: queueUrl,
MaxNumberOfMessages: 10,
@@ -328,13 +302,12 @@ async function processSQSMessages() {
});
const result = await sqsClient.send(receiveCommand);
console.log('SQS poll result:', result.Messages ? `${result.Messages.length} messages` : 'no messages');
if (result.Messages && result.Messages.length > 0) {
console.log('Processing', result.Messages.length, 'messages from SQS');
for (const message of result.Messages) {
try {
console.log("Processing message:", message);
//TODO: Add environment level filtering here.
await handleTextractNotification(message);
// Delete message after successful processing