IO-3515 resolve issues on search selects not updating, improve confidence scoring.

This commit is contained in:
Patrick Fic
2026-02-19 12:22:35 -08:00
parent 5d53d09af9
commit ae1408012f
11 changed files with 410 additions and 26552 deletions

View File

@@ -2,6 +2,8 @@
const Fuse = require('fuse.js');
const { has } = require("lodash");
const { standardizedFieldsnames } = require('./bill-ocr-normalize');
const InstanceManager = require("../../utils/instanceMgr").default;
// Tolerance applied when checking that extracted prices and costs are plausible.
// NOTE(review): the usage site is outside this view — confirm whether 0.5 means a
// 50% margin or 0.5 percentage points before changing it.
const PRICE_PERCENT_MARGIN_TOLERANCE = 0.5;
@@ -13,11 +15,97 @@ const normalizePartNumber = (str) => {
/**
 * Canonicalizes free text for fuzzy comparison.
 *
 * Drops every character that is not an ASCII letter, digit or whitespace,
 * squeezes each whitespace run down to a single space, trims both ends and
 * upper-cases the result.
 *
 * @param {string} str - Text to normalize (must be a string).
 * @returns {string} The normalized, upper-cased text.
 */
const normalizeText = (str) => {
  const alphanumericOnly = str.replace(/[^a-zA-Z0-9\s]/g, '');
  const singleSpaced = alphanumericOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim().toUpperCase();
};
/**
 * Strips currency symbols and other noise from a price string, keeping only
 * digits, dots and minus signs. The result is still a string.
 *
 * @param {*} str - Raw price value.
 * @returns {*} The cleaned string, or the input itself when it is not a string.
 */
const normalizePrice = (str) => (
  typeof str === 'string' ? str.replace(/[^0-9.-]+/g, "") : str
);
/**
 * Converts a raw (typically OCR-extracted) price value into a number.
 *
 * For string input the steps are, in order:
 *  1. Thousands separators that sit in front of a decimal part are collapsed
 *     ("1,234.56" -> "1234.56") so the grouped digits are read as one number.
 *  2. Every decimal-looking token (digits, ".", 1-2 digits) is extracted and
 *     one of the positive candidates is chosen (see choosePrice below).
 *  3. If there is no such token, the text is split on / | \ , ; and each
 *     part is parsed; one of the positive candidates is chosen.
 *  4. Otherwise everything except digits, "." and "-" is stripped and the
 *     remainder is parsed.
 *
 * NOTE(review): step 2 ignores a leading minus sign ("-12.50" -> 12.5) while
 * step 4 keeps it ("-12" -> -12) — confirm which is intended for credits.
 * NOTE(review): grouped integers without a decimal part ("1,234") are still
 * treated as comma-delimited values by step 3 — confirm whether that is wanted.
 *
 * @param {*} str - Raw price; non-strings are coerced with parseFloat.
 * @returns {number} The parsed price, or 0 when nothing numeric can be extracted.
 */
const normalizePriceFinal = (str) => {
  if (typeof str !== 'string') {
    // Non-string input (e.g. already numeric): coerce as-is, no rounding is applied.
    const num = parseFloat(str);
    return Number.isNaN(num) ? 0 : num;
  }

  // Last-resort parse: keep only digits, dots and minus signs.
  const parseStripped = (text) => {
    const parsed = parseFloat(text.replace(/[^0-9.-]+/g, ""));
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  // Picks one value from a non-empty list of positive candidates:
  // the single/repeated value, else the average when all are within 1% of it,
  // else the first candidate.
  const choosePrice = (candidates) => {
    if (candidates.length === 1 || new Set(candidates).size === 1) {
      return candidates[0];
    }
    const avg = candidates.reduce((a, b) => a + b, 0) / candidates.length;
    const allClose = candidates.every(num => Math.abs(num - avg) / avg < 0.01);
    return allClose ? avg : candidates[0];
  };

  // Collapse thousands separators only when a decimal part follows, which is
  // unambiguous. Without this, "1,234.56" was read as 234.56.
  const text = str.replace(
    /(?<![\d.])\d{1,3}(?:,\d{3})+(?=\.\d)/g,
    (grouped) => grouped.replace(/,/g, "")
  );

  // Preferred path: explicit decimal tokens such as "123.45".
  const decimalMatches = text.match(/\d+\.\d{1,2}/g) || [];
  const decimals = decimalMatches
    .map(m => parseFloat(m))
    .filter(n => !Number.isNaN(n) && n > 0);
  if (decimals.length > 0) {
    return choosePrice(decimals);
  }

  // Fallback: several values separated by common delimiters.
  const parts = text.split(/[\/|\\,;]/).map(part => part.trim()).filter(part => part.length > 0);
  if (parts.length > 1) {
    const numbers = parts
      .map(part => parseFloat(part.replace(/[^0-9.-]+/g, "")))
      .filter(num => !Number.isNaN(num) && num > 0);
    // No usable part: strip and parse the text as a whole instead.
    return numbers.length === 0 ? parseStripped(text) : choosePrice(numbers);
  }

  // Single value or no delimiters: clean normally.
  return parseStripped(text);
};
// Helper function to calculate Textract OCR confidence (0-100%)
const calculateTextractConfidence = (textractLineItem) => {
@@ -38,6 +126,11 @@ const calculateTextractConfidence = (textractLineItem) => {
return 0;
}
// Check if critical normalized labels are present
const hasActualCost = Object.values(textractLineItem).some(field => field.normalizedLabel === standardizedFieldsnames.actual_cost);
const hasActualPrice = Object.values(textractLineItem).some(field => field.normalizedLabel === standardizedFieldsnames.actual_price);
const hasLineDesc = Object.values(textractLineItem).some(field => field.normalizedLabel === standardizedFieldsnames.line_desc);
// Calculate weighted average, giving more weight to important fields
// If we can identify key fields (ITEM, PRODUCT_CODE, PRICE), weight them higher
let totalWeight = 0;
@@ -47,18 +140,42 @@ const calculateTextractConfidence = (textractLineItem) => {
if (field.confidence && typeof field.confidence === 'number') {
// Weight important fields higher
let weight = 1;
if (key === 'ITEM' || key === 'PRODUCT_CODE') {
weight = 2; // Description and part number are most important
} else if (key === 'PRICE' || key === 'UNIT_PRICE' || key === 'QUANTITY') {
weight = 1.5; // Price and quantity moderately important
if (field.normalizedLabel === standardizedFieldsnames.actual_cost || field.normalizedLabel === standardizedFieldsnames.actual_price) {
weight = 4;
}
else if (field.normalizedLabel === standardizedFieldsnames.part_no || field.normalizedLabel === standardizedFieldsnames.line_desc) {
weight = 3.5;
}
else if (field.normalizedLabel === standardizedFieldsnames.quantity) {
weight = 3.5;
}
// else if (key === 'ITEM' || key === 'PRODUCT_CODE') {
// weight = 3; // Description and part number are most important
// } else if (key === 'PRICE' || key === 'UNIT_PRICE' || key === 'QUANTITY') {
// weight = 2; // Price and quantity moderately important
// }
weightedSum += field.confidence * weight;
totalWeight += weight;
}
});
const avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0;
let avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0;
// Apply penalty if critical normalized labels are missing
let missingFieldsPenalty = 1.0;
let missingCount = 0;
if (!hasActualCost) missingCount++;
if (!hasActualPrice) missingCount++;
if (!hasLineDesc) missingCount++;
// Each missing field reduces confidence by 15%
if (missingCount > 0) {
missingFieldsPenalty = 1.0 - (missingCount * 0.15);
}
avgConfidence = avgConfidence * missingFieldsPenalty;
return Math.round(avgConfidence * 100) / 100; // Round to 2 decimal places
};
@@ -109,9 +226,9 @@ const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
// Overall confidence is affected by both how well Textract read the data
// and how well we matched it to existing joblines
// Use a weighted average: 40% OCR confidence, 60% match confidence
// Match confidence is more important because even perfect OCR is useless without a good match
const overall = (ocrConfidence * 0.4) + (matchConfidence * 0.6);
// Use a weighted average: 60% OCR confidence, 40% match confidence
// OCR confidence is more important because even perfect match is useless without good OCR
const overall = (ocrConfidence * 0.6) + (matchConfidence * 0.4);
return Math.round(overall * 100) / 100;
};
@@ -147,61 +264,63 @@ const mergeResults = (resultsArray, weights = []) => {
.slice(0, 5); // Return top 5 results
};
async function generateBillFormData({ processedData, jobid, bodyshopid, partsorderid, req }) {
async function generateBillFormData({ processedData, jobid: jobidFromProps, bodyshopid, partsorderid, req }) {
const client = req.userGraphQLClient;
//TODO: Add in vendor data.
let jobid = jobidFromProps;
//If no jobid, fetch it, and funnel it back.
if (!jobid || jobid === null || jobid === undefined || jobid === "" || jobid === "null" || jobid === "undefined") {
const ro_number = processedData.summary?.PO_NUMBER?.value || Object.values(processedData.summary).find(value => value.normalizedLabel === 'ro_number')?.value;
if (!ro_number) {
throw new Error("Could not find RO number in the extracted data to associate with the bill. Select an RO and try again.");
}
const { jobs } = await client.request(`
query QUERY_BILL_OCR_JOB_BY_RO($ro_number: String!) {
jobs(where: {ro_number: {_eq: $ro_number}}) {
id
}
}`, { ro_number });
if (jobs.length === 0) {
throw new Error("No job found for the detected RO/PO number.");
} else {
jobid = jobs[0].id;
}
}
const jobData = await client.request(`
query QUERY_BILL_OCR_DATA($jobid: uuid!, $partsorderid: uuid!) {
vendors{
query QUERY_BILL_OCR_DATA($jobid: uuid!) {
vendors {
id
name
}
jobs_by_pk(id: $jobid) {
id
bodyshop {
id
md_responsibility_centers
cdk_dealerid
pbs_serialnumber
rr_dealerid
}
jobs_by_pk(id: $jobid) {
id
bodyshop{
id
md_responsibility_centers
cdk_dealerid
pbs_serialnumber
rr_dealerid
}
joblines {
id
line_desc
removed
act_price
db_price
oem_partno
alt_partno
part_type
}
}
parts_orders_by_pk(id: $partsorderid) {
id
parts_order_lines {
id
line_desc
act_price
cost
jobline {
id
line_desc
act_price
oem_partno
alt_partno
part_type
}
}
}
joblines {
id
line_desc
removed
act_price
db_price
oem_partno
alt_partno
part_type
}
}
`, {
jobid, partsorderid // this may fail if null?
}
`, {
jobid, // TODO: Refactor back in parts orders
});
//TODO: Need to find a vendor ID. Create a fuse for it, and fuzzy search for it using the textract vendor info.
//Create fuses of line descriptions for matching.
const jobLineDescFuse = new Fuse(
jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
@@ -226,7 +345,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
},
{
name: 'oem_partno_normalized',
weight: 5
weight: 6
},
{
name: 'alt_partno_normalized',
@@ -238,7 +357,6 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
}
);
const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
const vendorFuse = new Fuse(
jobData.vendors,
@@ -250,13 +368,13 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
}
);
const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
const vendorMatches = vendorFuse.search(processedData.summary?.VENDOR_NAME?.value || processedData.summary?.NAME?.value);
let vendorid;
if (vendorMatches.length > 0) {
vendorid = vendorMatches[0].item.id;
}
const { jobs_by_pk: job, parts_orders_by_pk: partsOrder } = jobData;
const { jobs_by_pk: job } = jobData;
if (!job) {
throw new Error('Job not found for bill form data generation.');
}
@@ -344,9 +462,9 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
: null
: responsibilityCenters.defaults &&
(responsibilityCenters.defaults.costs[matchToUse?.item?.part_type] || null)
: null, //Needs to get set by client side.
"applicable_taxes": { //Not sure what to do with these?
"federal": false,
: null,
"applicable_taxes": {
"federal": InstanceManager({ imex: true, rome: false }),
"state": false,
"local": false
},
@@ -551,43 +669,43 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
})
// Output search statistics table
console.log('\n═══════════════════════════════════════════════════════════════════════');
console.log(' FUSE.JS SEARCH STATISTICS');
console.log('═══════════════════════════════════════════════════════════════════════\n');
// // Output search statistics table
// console.log('\n═══════════════════════════════════════════════════════════════════════');
// console.log(' FUSE.JS SEARCH STATISTICS');
// console.log('═══════════════════════════════════════════════════════════════════════\n');
searchStats.forEach(lineStat => {
console.log(`📄 Line Item #${lineStat.lineNumber}:`);
console.log('─'.repeat(75));
// searchStats.forEach(lineStat => {
// console.log(`📄 Line Item #${lineStat.lineNumber}:`);
// console.log('─'.repeat(75));
if (lineStat.searches.length > 0) {
const tableData = lineStat.searches.map(search => ({
'Search Type': search.type,
'Search Term': search.term.substring(0, 40) + (search.term.length > 40 ? '...' : ''),
'Results': search.results
}));
console.table(tableData);
} else {
console.log(' No searches performed for this line item.\n');
}
});
// if (lineStat.searches.length > 0) {
// const tableData = lineStat.searches.map(search => ({
// 'Search Type': search.type,
// 'Search Term': search.term.substring(0, 40) + (search.term.length > 40 ? '...' : ''),
// 'Results': search.results
// }));
// console.table(tableData);
// } else {
// console.log(' No searches performed for this line item.\n');
// }
// });
// Summary statistics
const totalSearches = searchStats.reduce((sum, stat) => sum + stat.searches.length, 0);
const totalResults = searchStats.reduce((sum, stat) =>
sum + stat.searches.reduce((s, search) => s + search.results, 0), 0);
const avgResultsPerSearch = totalSearches > 0 ? (totalResults / totalSearches).toFixed(2) : 0;
// // Summary statistics
// const totalSearches = searchStats.reduce((sum, stat) => sum + stat.searches.length, 0);
// const totalResults = searchStats.reduce((sum, stat) =>
// sum + stat.searches.reduce((s, search) => s + search.results, 0), 0);
// const avgResultsPerSearch = totalSearches > 0 ? (totalResults / totalSearches).toFixed(2) : 0;
console.log('═══════════════════════════════════════════════════════════════════════');
console.log(' SUMMARY');
console.log('═══════════════════════════════════════════════════════════════════════');
console.table({
'Total Line Items': processedData.lineItems.length,
'Total Searches Performed': totalSearches,
'Total Results Found': totalResults,
'Average Results per Search': avgResultsPerSearch
});
console.log('═══════════════════════════════════════════════════════════════════════\n');
// console.log('═══════════════════════════════════════════════════════════════════════');
// console.log(' SUMMARY');
// console.log('═══════════════════════════════════════════════════════════════════════');
// console.table({
// 'Total Line Items': processedData.lineItems.length,
// 'Total Searches Performed': totalSearches,
// 'Total Results Found': totalResults,
// 'Average Results per Search': avgResultsPerSearch
// });
// console.log('═══════════════════════════════════════════════════════════════════════\n');
return matches
}