IO-3515 add confidence scoring

This commit is contained in:
Patrick Fic
2026-02-09 14:47:20 -08:00
parent 20dad2caba
commit c59acb1b72

View File

@@ -19,6 +19,103 @@ const normalizePrice = (str) => {
return str.replace(/[^0-9.-]+/g, "");
};
/**
 * Calculate the Textract OCR confidence for a single line item.
 *
 * Produces a weighted average of the numeric `confidence` values found on the
 * line item's fields. Fields keyed `ITEM` or `PRODUCT_CODE` count double,
 * fields keyed `PRICE`, `UNIT_PRICE` or `QUANTITY` count 1.5x, and every other
 * field counts once.
 *
 * @param {Object<string, {confidence?: number}>} textractLineItem - Line item
 *   keyed by field type. Confidences are presumably already on a 0-100 scale
 *   (no rescaling is done here) — confirm against the Textract mapping code.
 * @returns {number} Weighted average rounded to 2 decimal places, or 0 when
 *   the line item is missing/empty or carries no usable confidence value.
 */
const calculateTextractConfidence = (textractLineItem) => {
  if (!textractLineItem) {
    return 0;
  }

  let totalWeight = 0;
  let weightedSum = 0;

  for (const [key, field] of Object.entries(textractLineItem)) {
    // `field` may be null/undefined for a field Textract did not populate.
    const confidence = field?.confidence;

    // A reading of 0 is a real (very poor) reading and must pull the average
    // down, so test the type rather than truthiness. NaN/Infinity are dropped.
    if (typeof confidence !== 'number' || !Number.isFinite(confidence)) {
      continue;
    }

    let weight;
    switch (key) {
      case 'ITEM':
      case 'PRODUCT_CODE':
        weight = 2; // Description and part number are most important
        break;
      case 'PRICE':
      case 'UNIT_PRICE':
      case 'QUANTITY':
        weight = 1.5; // Price and quantity moderately important
        break;
      default:
        weight = 1;
    }

    weightedSum += confidence * weight;
    totalWeight += weight;
  }

  // No field carried a usable confidence value.
  if (totalWeight === 0) {
    return 0;
  }

  return Math.round((weightedSum / totalWeight) * 100) / 100; // Round to 2 decimal places
};
/**
 * Calculate how confident we are in the best jobline match (0-100%).
 *
 * The score is the sum of:
 *  - a base score: `bestMatch.finalScore * 10`, capped at 100;
 *  - a field-match bonus: 5 per entry in `bestMatch.fieldMatches`, capped at 15;
 *  - a price-data bonus: 10 when `bestMatch.hasPriceData` is truthy;
 *  - a margin bonus: 5 when there is only one match, otherwise
 *    `(best - runnerUp) * 5` clamped to the range 0-10.
 * The total is rounded to 2 decimal places, capped at 100 and floored at 1.
 *
 * @param {Array<{finalScore?: number}>} matches - Candidate matches; index 1
 *   is read as the runner-up, so the list is presumably sorted best-first —
 *   confirm against the caller.
 * @param {{finalScore?: number, fieldMatches?: Array, hasPriceData?: boolean}} bestMatch -
 *   The match chosen for the line.
 * @returns {number} 0 when there is no match at all, otherwise a value in
 *   the range 1-100.
 */
const calculateMatchConfidence = (matches, bestMatch) => {
  if (!matches || matches.length === 0 || !bestMatch) {
    return 0; // No match = 0% confidence
  }

  // A missing or non-numeric score would otherwise turn the whole result into NaN.
  const bestScore = Number.isFinite(bestMatch.finalScore) ? bestMatch.finalScore : 0;

  // finalScore is already weighted and higher is better; scale by 10, cap at 100.
  const baseScore = Math.min(bestScore * 10, 100);

  // Bonus for multiple field matches (up to +15%); tolerate a missing list.
  const fieldMatchCount = bestMatch.fieldMatches?.length ?? 0;
  const fieldMatchBonus = Math.min(fieldMatchCount * 5, 15);

  // Bonus for having price data (+10%)
  const priceDataBonus = bestMatch.hasPriceData ? 10 : 0;

  // Bonus for a clear winner (gap between 1st and 2nd match).
  let confidenceMarginBonus = 5; // Only one match found: small fixed bonus
  if (matches.length > 1) {
    const runnerUpScore = matches[1]?.finalScore;
    // The margin cannot be measured without a runner-up score, so award
    // nothing in that case. Clamp at 0 so a "bonus" never becomes a penalty.
    confidenceMarginBonus = Number.isFinite(runnerUpScore)
      ? Math.max(0, Math.min((bestScore - runnerUpScore) * 5, 10))
      : 0;
  }

  const total = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus;

  // Cap at 100% and round to 2 decimal places
  const capped = Math.min(Math.round(total * 100) / 100, 100);

  // Ensure minimum of 1% if there's any match at all
  return Math.max(capped, 1);
};
/**
 * Blend the OCR confidence and the jobline match confidence into one score.
 *
 * @param {number} ocrConfidence - How well Textract read the line.
 * @param {number} matchConfidence - How well the line matched a jobline.
 * @returns {number} 0 when `matchConfidence` is exactly 0, otherwise
 *   40% of `ocrConfidence` plus 60% of `matchConfidence`, rounded to
 *   2 decimal places.
 */
const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
  // The match carries more weight: even a perfect read is useless if it
  // cannot be tied to an existing jobline.
  const OCR_SHARE = 0.4;
  const MATCH_SHARE = 0.6;

  if (matchConfidence !== 0) {
    const blended = (ocrConfidence * OCR_SHARE) + (matchConfidence * MATCH_SHARE);
    return Math.round(blended * 100) / 100;
  }

  // No match at all: the OCR quality alone says nothing useful.
  return 0;
};
// Helper function to merge and deduplicate results with weighted scoring
const mergeResults = (resultsArray, weights = []) => {
const scoreMap = new Map();
@@ -102,21 +199,33 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
//Create fuses of line descriptions for matching.
const jobLineDescFuse = new Fuse(
jobData.jobs_by_pk.joblines,
jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
{
keys: [{
name: 'line_desc',
weight: 4
weight: 6
}, {
name: 'oem_partno',
weight: 5
weight: 8
}, {
name: 'alt_partno',
weight: 3
weight: 5
},
{
name: 'act_price',
weight: 1
},
{
name: 'line_desc_normalized',
weight: 4
},
{
name: 'oem_partno_normalized',
weight: 5
},
{
name: 'alt_partno_normalized',
weight: 3
}],
threshold: 0.4, //Adjust as needed for matching sensitivity,
includeScore: true,
@@ -124,7 +233,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
}
);
const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
console.log("*** ~ generateBillFormData ~ joblineMatches:", joblineMatches);
console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
const vendorFuse = new Fuse(
jobData.vendors,
@@ -137,7 +246,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
);
const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
console.log("*** ~ generateBillFormData ~ vendorMatches:", vendorMatches);
console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
let vendorid;
if (vendorMatches.length > 0) {
vendorid = vendorMatches[0].item.id;
@@ -162,6 +271,11 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
const { matches, textractLineItem, } = jlMatchLine
//Matches should be prioritized, take the first one.
const matchToUse = matches.length > 0 ? matches[0] : null;
// Calculate confidence scores (0-100%)
const ocrConfidence = calculateTextractConfidence(textractLineItem);
const matchConfidence = calculateMatchConfidence(matches, matchToUse);
const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence);
//TODO: Should be using the textract if there is an exact match on the normalized label.
//if there isn't then we can do the below.
@@ -224,6 +338,8 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
"local": false
},
"joblineid": matchToUse?.item?.id || "noline",
"confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}`
}
return lineObject
})
@@ -294,11 +410,14 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
const dashSearch = fuseToSearch.search(withDashes);
lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });
// 4: Search letters only (remove numbers and special chars)
// 4: Special chars to spaces (preserve word boundaries)
const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase();
const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces);
lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length });
return mergeResults(
[normalizedSearch, minimalSearch, dashSearch],
[1.0, 1.1, 1.2] // Prefer fully normalized, letters-only weighted less
[normalizedSearch, minimalSearch, dashSearch, specialCharsSearch],
[1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted
);
})();