From c59acb1b72010950c2e545daf1994402e0245dfa Mon Sep 17 00:00:00 2001
From: Patrick Fic <patrick@imexsystems.ca>
Date: Mon, 9 Feb 2026 14:47:20 -0800
Subject: [PATCH] IO-3515 add confidence scoring

---
 server/ai/bill-ocr/bill-ocr-generator.js | 137 +++++++++++++++++++++--
 1 file changed, 128 insertions(+), 9 deletions(-)

diff --git a/server/ai/bill-ocr/bill-ocr-generator.js b/server/ai/bill-ocr/bill-ocr-generator.js
index 49da44682..b366cb2ca 100644
--- a/server/ai/bill-ocr/bill-ocr-generator.js
+++ b/server/ai/bill-ocr/bill-ocr-generator.js
@@ -19,6 +19,103 @@ const normalizePrice = (str) => {
     return str.replace(/[^0-9.-]+/g, "");
 };
 
+/**
+ * Calculates a weighted-average Textract OCR confidence for one line item.
+ *
+ * Each field that carries a usable numeric `confidence` contributes to the
+ * average: ITEM and PRODUCT_CODE count double, PRICE / UNIT_PRICE / QUANTITY
+ * count 1.5x, and every other field counts once.
+ *
+ * @param {Object} textractLineItem - Map of field name to a field object that
+ *     may carry a numeric `confidence`. Null/undefined fields are skipped.
+ * @returns {number} Weighted average rounded to 2 decimal places, or 0 when
+ *     the line item is missing, empty, or has no usable confidence values.
+ */
+const calculateTextractConfidence = (textractLineItem) => {
+    if (!textractLineItem) {
+        return 0;
+    }
+
+    // Relative importance of each field; anything not listed has weight 1.
+    const fieldWeights = new Map([
+        ['ITEM', 2], ['PRODUCT_CODE', 2], // Description and part number are most important
+        ['PRICE', 1.5], ['UNIT_PRICE', 1.5], ['QUANTITY', 1.5] // Price and quantity moderately important
+    ]);
+
+    let totalWeight = 0;
+    let weightedSum = 0;
+
+    Object.entries(textractLineItem).forEach(([key, field]) => {
+        // Optional chaining guards against null/undefined fields, which would
+        // otherwise throw. A confidence of 0 or NaN is treated as "no reading".
+        const confidence = field?.confidence;
+        if (typeof confidence !== 'number' || !confidence) {
+            return;
+        }
+
+        const weight = fieldWeights.get(key) ?? 1;
+        weightedSum += confidence * weight;
+        totalWeight += weight;
+    });
+
+    const avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0;
+    return Math.round(avgConfidence * 100) / 100; // Round to 2 decimal places
+};
+
+/**
+ * Calculates how confident we are in the chosen jobline match (0-100%).
+ *
+ * Score = finalScore * 10 (capped at 100), +5 per matched field (max +15),
+ * +10 when price data is present, plus a "clear winner" bonus (max +10)
+ * based on the gap to the runner-up, or a flat +5 for a sole match.
+ *
+ * @param {Array<Object>} matches - Candidate matches; the gap bonus compares
+ *     `bestMatch` against `matches[1]`.
+ * @param {Object} bestMatch - Match in use; reads `finalScore`,
+ *     `fieldMatches` and `hasPriceData`.
+ * @returns {number} 0 when there is no match, otherwise a value in 1-100
+ *     rounded to 2 decimal places.
+ */
+const calculateMatchConfidence = (matches, bestMatch) => {
+    if (!matches || matches.length === 0 || !bestMatch) {
+        return 0; // No match = 0% confidence
+    }
+
+    // Missing or non-numeric scores count as 0 so NaN never reaches the result.
+    const scoreOf = (match) => (Number.isFinite(match?.finalScore) ? match.finalScore : 0);
+    const bestScore = scoreOf(bestMatch);
+
+    // finalScore is already weighted and higher is better; scale by 10, cap at 100.
+    const baseScore = Math.min(bestScore * 10, 100);
+    // fieldMatches may be absent on a match, which counts as zero matched fields.
+    const fieldMatchBonus = Math.min((bestMatch.fieldMatches?.length ?? 0) * 5, 15);
+    const priceDataBonus = bestMatch.hasPriceData ? 10 : 0;
+    // The gap is clamped at 0 so an unsorted match list cannot turn the bonus into a penalty.
+    const confidenceMarginBonus = matches.length > 1
+        ? Math.min(Math.max(bestScore - scoreOf(matches[1]), 0) * 5, 10)
+        : 5; // Only one match found, add small bonus
+
+    const matchConfidence = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus;
+    // Round to 2 decimals, then clamp to 1-100: any match at all reports at least 1%.
+    return Math.max(Math.min(Math.round(matchConfidence * 100) / 100, 100), 1);
+};
+
+/**
+ * Blends OCR confidence and jobline match confidence into one overall score.
+ *
+ * @param {number} ocrConfidence - How well Textract read the line.
+ * @param {number} matchConfidence - How well the line matched a jobline.
+ * @returns {number} 0 when matchConfidence is exactly 0; otherwise the
+ *     40% OCR / 60% match weighted average, rounded to 2 decimal places.
+ */
+const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
+    // Match quality dominates: even perfect OCR is useless without a good match,
+    // so an unmatched line scores 0 regardless of how well it was read.
+    const hasMatch = matchConfidence !== 0;
+    const blended = hasMatch ? (ocrConfidence * 0.4) + (matchConfidence * 0.6) : 0;
+    return Math.round(blended * 100) / 100;
+};
+
 // Helper function to merge and deduplicate results with weighted scoring
 const mergeResults = (resultsArray, weights = []) => {
     const scoreMap = new Map();
@@ -102,21 +199,33 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
 
     //Create fuses of line descriptions for matching.
     const jobLineDescFuse = new Fuse(
-        jobData.jobs_by_pk.joblines,
+        jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
         {
             keys: [{
                 name: 'line_desc',
-                weight: 4
+                weight: 6
             }, {
                 name: 'oem_partno',
-                weight: 5
+                weight: 8
             }, {
                 name: 'alt_partno',
-                weight: 3
+                weight: 5
             },
             {
                 name: 'act_price',
                 weight: 1
+            },
+            {
+                name: 'line_desc_normalized',
+                weight: 4
+            },
+            {
+                name: 'oem_partno_normalized',
+                weight: 5
+            },
+            {
+                name: 'alt_partno_normalized',
+                weight: 3
             }],
             threshold: 0.4, //Adjust as needed for matching sensitivity,
             includeScore: true,
@@ -124,7 +233,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
         }
     );
     const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
-    console.log("*** ~ generateBillFormData ~ joblineMatches:", joblineMatches);
+    console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
 
     const vendorFuse = new Fuse(
         jobData.vendors,
@@ -137,7 +246,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
     );
 
     const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
-    console.log("*** ~ generateBillFormData ~ vendorMatches:", vendorMatches);
+    console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
     let vendorid;
     if (vendorMatches.length > 0) {
         vendorid = vendorMatches[0].item.id;
@@ -162,6 +271,11 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
             const { matches, textractLineItem, } = jlMatchLine
             //Matches should be prioritized, take the first one. 
             const matchToUse = matches.length > 0 ? matches[0] : null;
+
+            // Calculate confidence scores (0-100%)
+            const ocrConfidence = calculateTextractConfidence(textractLineItem);
+            const matchConfidence = calculateMatchConfidence(matches, matchToUse);
+            const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence);
             //TODO: Should be using the textract if there is an exact match on the normalized label.
             //if there isn't then we can do the below. 
 
@@ -224,6 +338,8 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
                     "local": false
                 },
                 "joblineid": matchToUse?.item?.id || "noline",
+                "confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}`
+
             }
             return lineObject
         })
@@ -294,11 +410,14 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
             const dashSearch = fuseToSearch.search(withDashes);
             lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });
 
-            //  4: Search letters only (remove numbers and special chars)
+            //  4: Special chars to spaces (preserve word boundaries)
+            const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase();
+            const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces);
+            lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length });
 
             return mergeResults(
-                [normalizedSearch, minimalSearch, dashSearch],
-                [1.0, 1.1, 1.2] // Prefer fully normalized, letters-only weighted less
+                [normalizedSearch, minimalSearch, dashSearch, specialCharsSearch],
+                [1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted
             );
         })();