IO-3515 add confidence scoring

2026-02-09 14:47:20 -08:00
parent 20dad2caba
commit c59acb1b72
1 changed files with 128 additions and 9 deletions
--- a/server/ai/bill-ocr/bill-ocr-generator.js
+++ b/server/ai/bill-ocr/bill-ocr-generator.js
@@ -19,6 +19,103 @@ const normalizePrice = (str) => {
    return str.replace(/[^0-9.-]+/g, "");
 };

+/**
+ * Calculate the weighted-average OCR confidence for one Textract line item.
+ *
+ * Each field is expected to look like `{ confidence: number, ... }`. ITEM and
+ * PRODUCT_CODE count double, PRICE / UNIT_PRICE / QUANTITY count 1.5x, and every
+ * other field counts once.
+ * @param {Object} textractLineItem - Map of field name to extracted field data.
+ * @returns {number} Weighted average confidence rounded to 2 decimal places, or
+ *   0 when the line item is missing or no field carries a numeric confidence.
+ */
+const calculateTextractConfidence = (textractLineItem) => {
+    if (!textractLineItem || typeof textractLineItem !== 'object') {
+        return 0;
+    }
+
+    // Relative importance of each field; anything not listed weighs 1.
+    const fieldWeights = new Map([
+        ['ITEM', 2], // Description and part number are most important
+        ['PRODUCT_CODE', 2],
+        ['PRICE', 1.5], // Price and quantity moderately important
+        ['UNIT_PRICE', 1.5],
+        ['QUANTITY', 1.5]
+    ]);
+
+    let totalWeight = 0;
+    let weightedSum = 0;
+
+    for (const [key, field] of Object.entries(textractLineItem)) {
+        // Optional chaining tolerates null/undefined fields, and isFinite (rather
+        // than a truthiness test) keeps a genuine 0 confidence in the average.
+        const confidence = field?.confidence;
+        if (!Number.isFinite(confidence)) {
+            continue;
+        }
+        const weight = fieldWeights.get(key) ?? 1;
+        weightedSum += confidence * weight;
+        totalWeight += weight;
+    }
+
+    const avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0;
+    return Math.round(avgConfidence * 100) / 100; // Round to 2 decimal places
+};
+
+/**
+ * Calculate how confident we are in the best jobline match (0-100%).
+ *
+ * @param {Array<Object>} matches - Candidate matches, expected best first; each
+ *   may carry `finalScore`, `fieldMatches` and `hasPriceData`.
+ * @param {Object|null} bestMatch - The match being scored (normally matches[0]).
+ * @returns {number} 0 when there is no match, otherwise 1-100 rounded to 2 places.
+ */
+const calculateMatchConfidence = (matches, bestMatch) => {
+    if (!matches || matches.length === 0 || !bestMatch) {
+        return 0; // No match = 0% confidence
+    }
+
+    // A missing or non-numeric score counts as 0 so NaN never reaches the result.
+    const toScore = (value) => (Number.isFinite(value) ? value : 0);
+    const bestScore = toScore(bestMatch.finalScore);
+
+    // finalScore is already weighted and higher is better; scale by 10, cap at 100.
+    const baseScore = Math.min(bestScore * 10, 100);
+
+    // Bonus for multiple field matches (+5% each, up to +15%).
+    const fieldMatchCount = bestMatch.fieldMatches?.length ?? 0;
+    const fieldMatchBonus = Math.min(fieldMatchCount * 5, 15);
+
+    // Bonus for having price data (+10%).
+    const priceDataBonus = bestMatch.hasPriceData ? 10 : 0;
+
+    // Bonus for a clear winner: gap to the runner-up (up to +10%), or a flat +5%
+    // when this is the only match.
+    const confidenceMarginBonus = matches.length > 1
+        ? Math.min((bestScore - toScore(matches[1]?.finalScore)) * 5, 10)
+        : 5;
+
+    // Cap at 100%, round to 2 decimal places, and floor at 1% for any match.
+    const matchConfidence = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus;
+    return Math.max(Math.min(Math.round(matchConfidence * 100) / 100, 100), 1);
+};
+
+/**
+ * Blend OCR and match confidence into a single overall score.
+ *
+ * The blend is 40% OCR / 60% match: match quality dominates because even a
+ * perfectly read line is useless without a good jobline to attach it to.
+ *
+ * @param {number} ocrConfidence - How well Textract read the line.
+ * @param {number} matchConfidence - How well the line matched a jobline.
+ * @returns {number} 0 when matchConfidence is exactly 0, else the blend rounded to 2 places.
+ */
+const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
+    // Without a match the OCR quality is irrelevant.
+    const blended = matchConfidence === 0 ? 0 : (ocrConfidence * 0.4) + (matchConfidence * 0.6);
+    return Math.round(blended * 100) / 100;
+};
+
 // Helper function to merge and deduplicate results with weighted scoring
 const mergeResults = (resultsArray, weights = []) => {
    const scoreMap = new Map();
@@ -102,21 +199,33 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord

    //Create fuses of line descriptions for matching.
    const jobLineDescFuse = new Fuse(
-        jobData.jobs_by_pk.joblines,
+        jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
        {
            keys: [{
                name: 'line_desc',
-                weight: 4
+                weight: 6
            }, {
                name: 'oem_partno',
-                weight: 5
+                weight: 8
            }, {
                name: 'alt_partno',
-                weight: 3
+                weight: 5
            },
            {
                name: 'act_price',
                weight: 1
+            },
+            {
+                name: 'line_desc_normalized',
+                weight: 4
+            },
+            {
+                name: 'oem_partno_normalized',
+                weight: 5
+            },
+            {
+                name: 'alt_partno_normalized',
+                weight: 3
            }],
            threshold: 0.4, //Adjust as needed for matching sensitivity,
            includeScore: true,
@@ -124,7 +233,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
        }
    );
    const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
-    console.log("*** ~ generateBillFormData ~ joblineMatches:", joblineMatches);
+    console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));

    const vendorFuse = new Fuse(
        jobData.vendors,
@@ -137,7 +246,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
    );

    const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
-    console.log("*** ~ generateBillFormData ~ vendorMatches:", vendorMatches);
+    console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
    let vendorid;
    if (vendorMatches.length > 0) {
        vendorid = vendorMatches[0].item.id;
@@ -162,6 +271,11 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
            const { matches, textractLineItem, } = jlMatchLine
            //Matches should be prioritized, take the first one. 
            const matchToUse = matches.length > 0 ? matches[0] : null;
+
+            // Calculate confidence scores (0-100%)
+            const ocrConfidence = calculateTextractConfidence(textractLineItem);
+            const matchConfidence = calculateMatchConfidence(matches, matchToUse);
+            const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence);
            //TODO: Should be using the textract if there is an exact match on the normalized label.
            //if there isn't then we can do the below. 

@@ -224,6 +338,8 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
                    "local": false
                },
                "joblineid": matchToUse?.item?.id || "noline",
+                "confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}`
+
            }
            return lineObject
        })
@@ -294,11 +410,14 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
            const dashSearch = fuseToSearch.search(withDashes);
            lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });

-            //  4: Search letters only (remove numbers and special chars)
+            //  4: Special chars to spaces (preserve word boundaries)
+            const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase();
+            const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces);
+            lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length });

            return mergeResults(
-                [normalizedSearch, minimalSearch, dashSearch],
-                [1.0, 1.1, 1.2] // Prefer fully normalized, letters-only weighted less
+                [normalizedSearch, minimalSearch, dashSearch, specialCharsSearch],
+                [1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted
            );
        })();