// -----------------------------------------------------------------------------
// Provenance
//
// This content was recovered from a git format-patch whose line breaks had been
// collapsed:
//
//   From c59acb1b72010950c2e545daf1994402e0245dfa Mon Sep 17 00:00:00 2001
//   From: Patrick Fic
//   Date: Mon, 9 Feb 2026 14:47:20 -0800
//   Subject: [PATCH] IO-3515 add confidence scoring
//   ---
//   server/ai/bill-ocr/bill-ocr-generator.js | 137 +++++++++++++++++++++--
//   1 file changed, 128 insertions(+), 9 deletions(-)
//   diff --git a/server/ai/bill-ocr/bill-ocr-generator.js b/server/ai/bill-ocr/bill-ocr-generator.js
//   index 49da44682..b366cb2ca 100644
//
// Hunk 1 of that patch:
//
//   @@ -19,6 +19,103 @@ const normalizePrice = (str) => {
//    return str.replace(/[^0-9.-]+/g, "");
//    };
//    (three helper functions added here)
//    // Helper function to merge and deduplicate results with weighted scoring
//    const mergeResults = (resultsArray, weights = []) => {
//    const scoreMap = new Map();
//
// The three helpers it adds are written out below as plain JavaScript, with
// the robustness fixes described on each function. The remaining hunks modify
// functions that are only partially visible in the patch, so they are kept as
// diff text at the end of this file instead of being rewritten.
// -----------------------------------------------------------------------------

// Relative weight of each Textract field when averaging per-field confidence.
// A Map (not a plain object) so that field names such as "constructor" cannot
// pick up inherited properties. Fields not listed here get weight 1.
const TEXTRACT_FIELD_WEIGHTS = new Map([
  ['ITEM', 2], // description
  ['PRODUCT_CODE', 2], // part number
  ['PRICE', 1.5],
  ['UNIT_PRICE', 1.5],
  ['QUANTITY', 1.5],
]);

// Returns `value` when it is a finite number, otherwise `fallback`.
const finiteOr = (value, fallback) =>
  typeof value === 'number' && Number.isFinite(value) ? value : fallback;

/**
 * Helper function to calculate Textract OCR confidence for one line item.
 *
 * Computes the weighted mean of every field's numeric `confidence`, using
 * TEXTRACT_FIELD_WEIGHTS, rounded to 2 decimal places. The result is on the
 * same scale as the inputs (the original patch describes it as 0-100%; the
 * function itself does not clamp).
 *
 * Changes from the original patch:
 * - fields whose value is null/undefined no longer throw;
 * - a confidence of exactly 0 is now counted (it was skipped by a truthiness
 *   check, which inflated the average);
 * - non-finite confidences (NaN/Infinity) are ignored;
 * - the redundant first pass over the fields was removed.
 *
 * @param {Object<string, {confidence?: number}>|null|undefined} textractLineItem
 *   Map of field name to field data for one OCR'd line.
 * @returns {number} Weighted average confidence, or 0 when no field carries a
 *   usable confidence.
 */
const calculateTextractConfidence = (textractLineItem) => {
  if (!textractLineItem || typeof textractLineItem !== 'object') {
    return 0;
  }

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, field] of Object.entries(textractLineItem)) {
    const confidence = finiteOr(field?.confidence, null);
    if (confidence === null) {
      continue;
    }

    const weight = TEXTRACT_FIELD_WEIGHTS.get(key) ?? 1;
    weightedSum += confidence * weight;
    totalWeight += weight;
  }

  if (totalWeight === 0) {
    return 0;
  }

  return Math.round((weightedSum / totalWeight) * 100) / 100; // 2 decimal places
};

/**
 * Helper function to calculate match confidence score (0-100%).
 *
 * Score = base + field-match bonus + price-data bonus + margin bonus, capped
 * at 100 and floored at 1 whenever a best match exists:
 * - base: `bestMatch.finalScore * 10`, capped at 100. Per the original patch
 *   comment, finalScore is already weighted and higher is better; it is
 *   produced elsewhere in the file.
 * - field-match bonus: 5 per entry of `bestMatch.fieldMatches`, up to 15.
 * - price-data bonus: 10 when `bestMatch.hasPriceData` is truthy.
 * - margin bonus: with a single match, 5; otherwise 5 per point of lead over
 *   `matches[1]`, between 0 and 10.
 *
 * Changes from the original patch:
 * - a missing `fieldMatches` array no longer throws (counts as no bonus);
 * - a missing/non-finite `finalScore` no longer turns the result into NaN
 *   (best score counts as 0; an unknown runner-up score earns no margin bonus);
 * - the margin bonus can no longer go negative when the runner-up scores
 *   higher than the best match.
 *
 * @param {Array<{finalScore?: number}>|null|undefined} matches
 *   Candidate matches; `matches[1]` is treated as the runner-up.
 * @param {{finalScore?: number, fieldMatches?: Array, hasPriceData?: boolean}|null|undefined} bestMatch
 *   The match that was selected.
 * @returns {number} 0 when there is no match, otherwise a value in [1, 100]
 *   rounded to 2 decimal places.
 */
const calculateMatchConfidence = (matches, bestMatch) => {
  if (!matches || matches.length === 0 || !bestMatch) {
    return 0; // No match = 0% confidence
  }

  const bestScore = finiteOr(bestMatch.finalScore, 0);
  const baseScore = Math.min(bestScore * 10, 100);

  const fieldMatchCount = Array.isArray(bestMatch.fieldMatches) ? bestMatch.fieldMatches.length : 0;
  const fieldMatchBonus = Math.min(fieldMatchCount * 5, 15);

  const priceDataBonus = bestMatch.hasPriceData ? 10 : 0;

  let confidenceMarginBonus = 5; // Only one match found: small fixed bonus
  if (matches.length > 1) {
    const runnerUpScore = finiteOr(matches[1]?.finalScore, null);
    const lead = runnerUpScore === null ? 0 : Math.max(bestScore - runnerUpScore, 0);
    confidenceMarginBonus = Math.min(lead * 5, 10);
  }

  const total = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus;
  const capped = Math.min(Math.round(total * 100) / 100, 100);

  // Ensure minimum of 1% if there's any match at all
  return Math.max(capped, 1);
};

/**
 * Helper function to calculate overall confidence combining OCR and match
 * confidence.
 *
 * Weighted average: 40% OCR confidence, 60% match confidence, rounded to
 * 2 decimal places. A match confidence of 0 short-circuits to 0 regardless of
 * OCR quality.
 *
 * Change from the original patch: non-finite inputs no longer produce NaN.
 * A non-finite match confidence yields 0; a non-finite OCR confidence
 * contributes 0.
 *
 * @param {number} ocrConfidence - Result of calculateTextractConfidence.
 * @param {number} matchConfidence - Result of calculateMatchConfidence.
 * @returns {number} Combined confidence on the same scale as the inputs.
 */
const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
  const OCR_WEIGHT = 0.4;
  const MATCH_WEIGHT = 0.6;

  const match = finiteOr(matchConfidence, 0);
  if (match === 0) {
    return 0;
  }

  const overall = finiteOr(ocrConfidence, 0) * OCR_WEIGHT + match * MATCH_WEIGHT;
  return Math.round(overall * 100) / 100;
};

// -----------------------------------------------------------------------------
// Remaining hunks of the same patch, preserved as diff text.
//
// They modify generateBillFormData and joblineFuzzySearch, whose full bodies
// are not part of this content. One diff line per comment line; the original
// indentation and the position of blank context lines could not be recovered.
// -----------------------------------------------------------------------------
//
// @@ -102,21 +199,33 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
//  //Create fuses of line descriptions for matching.
//  const jobLineDescFuse = new Fuse(
// - jobData.jobs_by_pk.joblines,
// + jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
//  {
//  keys: [{
//  name: 'line_desc',
// - weight: 4
// + weight: 6
//  }, {
//  name: 'oem_partno',
// - weight: 5
// + weight: 8
//  }, {
//  name: 'alt_partno',
// - weight: 3
// + weight: 5
//  }, {
//  name: 'act_price',
//  weight: 1
// + },
// + {
// + name: 'line_desc_normalized',
// + weight: 4
// + },
// + {
// + name: 'oem_partno_normalized',
// + weight: 5
// + },
// + {
// + name: 'alt_partno_normalized',
// + weight: 3
//  }],
//  threshold: 0.4, //Adjust as needed for matching sensitivity,
//  includeScore: true,
//
// @@ -124,7 +233,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
//  }
//  );
//  const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
// - console.log("*** ~ generateBillFormData ~ joblineMatches:", joblineMatches);
// + console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
//  const vendorFuse = new Fuse(
//  jobData.vendors,
//
// @@ -137,7 +246,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
//  );
//  const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
// - console.log("*** ~ generateBillFormData ~ vendorMatches:", vendorMatches);
// + console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
//  let vendorid;
//  if (vendorMatches.length > 0) {
//  vendorid = vendorMatches[0].item.id;
//
// @@ -162,6 +271,11 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
//  const { matches, textractLineItem, } = jlMatchLine
//  //Matches should be prioritized, take the first one.
//  const matchToUse = matches.length > 0 ? matches[0] : null;
// +
// + // Calculate confidence scores (0-100%)
// + const ocrConfidence = calculateTextractConfidence(textractLineItem);
// + const matchConfidence = calculateMatchConfidence(matches, matchToUse);
// + const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence);
//  //TODO: Should be using the textract if there is an exact match on the normalized label.
//  //if there isn't then we can do the below.
//
// @@ -224,6 +338,8 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
//  "local": false
//  },
//  "joblineid": matchToUse?.item?.id || "noline",
// + "confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}`
// +
//  }
//  return lineObject
//  })
//
// @@ -294,11 +410,14 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
//  const dashSearch = fuseToSearch.search(withDashes);
//  lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });
// - // 4: Search letters only (remove numbers and special chars)
// + // 4: Special chars to spaces (preserve word boundaries)
// + const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase();
// + const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces);
// + lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length });
//  return mergeResults(
// - [normalizedSearch, minimalSearch, dashSearch],
// - [1.0, 1.1, 1.2] // Prefer fully normalized, letters-only weighted less
// + [normalizedSearch, minimalSearch, dashSearch, specialCharsSearch],
// + [1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted
//  );
//  })();