IO-3515 add confidence scoring
This commit is contained in:
@@ -19,6 +19,103 @@ const normalizePrice = (str) => {
|
||||
return str.replace(/[^0-9.-]+/g, "");
|
||||
};
|
||||
|
||||
/**
 * Calculates the OCR confidence (0-100%) of a single Textract line item.
 *
 * The result is a weighted average of the numeric `confidence` value found on
 * each field of the line item. Fields keyed ITEM or PRODUCT_CODE count double,
 * PRICE / UNIT_PRICE / QUANTITY count 1.5x, and every other field counts once.
 * Fields that are null/undefined or carry no finite numeric confidence are
 * ignored. A confidence of exactly 0 is a valid reading and is included, so a
 * field that was read with no confidence lowers the average.
 *
 * @param {Object} textractLineItem - Map of field name to a field object that
 *   may carry a numeric `confidence` property.
 * @returns {number} Weighted average confidence rounded to 2 decimal places,
 *   or 0 when the line item is missing, empty, or has no usable confidences.
 */
const calculateTextractConfidence = (textractLineItem) => {
  if (!textractLineItem || typeof textractLineItem !== 'object') {
    return 0;
  }

  // Description and part number matter most; price and quantity moderately.
  const fieldWeights = new Map([
    ['ITEM', 2],
    ['PRODUCT_CODE', 2],
    ['PRICE', 1.5],
    ['UNIT_PRICE', 1.5],
    ['QUANTITY', 1.5],
  ]);

  let totalWeight = 0;
  let weightedSum = 0;

  for (const [key, field] of Object.entries(textractLineItem)) {
    // Optional chaining keeps a null/undefined field from throwing.
    const confidence = field?.confidence;
    if (!Number.isFinite(confidence)) {
      continue;
    }

    const weight = fieldWeights.get(key) ?? 1;
    weightedSum += confidence * weight;
    totalWeight += weight;
  }

  if (totalWeight === 0) {
    return 0;
  }

  // Round to 2 decimal places.
  return Math.round((weightedSum / totalWeight) * 100) / 100;
};
|
||||
|
||||
/**
 * Calculates how confident we are in the chosen jobline match (0-100%).
 *
 * The score is the sum of:
 *  - a base score: `bestMatch.finalScore * 10`, capped at 100;
 *  - a field-match bonus: 5 per entry in `bestMatch.fieldMatches`, up to 15;
 *  - a price-data bonus: 10 when `bestMatch.hasPriceData` is truthy;
 *  - a margin bonus: 5 when there is only one candidate, otherwise
 *    5x the score gap between the best match and `matches[1]`, kept within
 *    0..10 so that a "bonus" can never turn into a penalty.
 * The total is rounded to 2 decimals, capped at 100, and floored at 1 so that
 * any match at all reports a non-zero confidence.
 *
 * Missing or non-numeric `finalScore` values are treated as 0 and a missing
 * `fieldMatches` list is treated as empty, so the function never throws on a
 * partially populated match and never returns NaN.
 *
 * @param {Array<Object>} matches - Candidate matches; `matches[1]` is read as
 *   the runner-up when present.
 * @param {Object|null} bestMatch - The match that was selected.
 * @returns {number} 0 when there is no match, otherwise a value in 1..100.
 */
const calculateMatchConfidence = (matches, bestMatch) => {
  if (!matches || matches.length === 0 || !bestMatch) {
    return 0; // No match = 0% confidence
  }

  const toScore = (value) => (Number.isFinite(value) ? value : 0);
  const bestScore = toScore(bestMatch.finalScore);

  // Scale factor of 10, capped at 100.
  const baseScore = Math.min(bestScore * 10, 100);

  // Up to +15 for multiple field matches.
  const fieldMatchCount = bestMatch.fieldMatches?.length ?? 0;
  const fieldMatchBonus = Math.min(fieldMatchCount * 5, 15);

  // +10 for having price data.
  const priceDataBonus = bestMatch.hasPriceData ? 10 : 0;

  // A lone candidate gets a small fixed bonus; otherwise reward a clear winner.
  let confidenceMarginBonus = 5;
  if (matches.length > 1) {
    const scoreDiff = bestScore - toScore(matches[1]?.finalScore);
    confidenceMarginBonus = Math.min(Math.max(scoreDiff * 5, 0), 10);
  }

  const total = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus;

  // Cap at 100% and round to 2 decimal places.
  const capped = Math.min(Math.round(total * 100) / 100, 100);

  // Ensure a minimum of 1% if there is any match at all.
  return Math.max(capped, 1);
};
|
||||
|
||||
/**
 * Combines OCR confidence and match confidence into one overall score.
 *
 * The overall score is a weighted average: 40% OCR confidence and 60% match
 * confidence. Match confidence weighs more because even a perfect OCR read is
 * of no use without a good match. When the match confidence is 0 the overall
 * confidence is 0 regardless of the OCR confidence.
 *
 * Inputs that are not finite numbers (undefined, null, NaN, strings) are
 * treated as 0 so the result is always a number rather than NaN.
 *
 * @param {number} ocrConfidence - OCR confidence for the line.
 * @param {number} matchConfidence - Match confidence for the line.
 * @returns {number} Overall confidence rounded to 2 decimal places.
 */
const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
  const ocr = Number.isFinite(ocrConfidence) ? ocrConfidence : 0;
  const match = Number.isFinite(matchConfidence) ? matchConfidence : 0;

  // Without a match, the OCR confidence is irrelevant.
  if (match === 0) {
    return 0;
  }

  const overall = ocr * 0.4 + match * 0.6;

  return Math.round(overall * 100) / 100;
};
|
||||
|
||||
// Helper function to merge and deduplicate results with weighted scoring
|
||||
const mergeResults = (resultsArray, weights = []) => {
|
||||
const scoreMap = new Map();
|
||||
@@ -102,21 +199,33 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
||||
|
||||
//Create fuses of line descriptions for matching.
|
||||
const jobLineDescFuse = new Fuse(
|
||||
jobData.jobs_by_pk.joblines,
|
||||
jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
|
||||
{
|
||||
keys: [{
|
||||
name: 'line_desc',
|
||||
weight: 4
|
||||
weight: 6
|
||||
}, {
|
||||
name: 'oem_partno',
|
||||
weight: 5
|
||||
weight: 8
|
||||
}, {
|
||||
name: 'alt_partno',
|
||||
weight: 3
|
||||
weight: 5
|
||||
},
|
||||
{
|
||||
name: 'act_price',
|
||||
weight: 1
|
||||
},
|
||||
{
|
||||
name: 'line_desc_normalized',
|
||||
weight: 4
|
||||
},
|
||||
{
|
||||
name: 'oem_partno_normalized',
|
||||
weight: 5
|
||||
},
|
||||
{
|
||||
name: 'alt_partno_normalized',
|
||||
weight: 3
|
||||
}],
|
||||
threshold: 0.4, //Adjust as needed for matching sensitivity,
|
||||
includeScore: true,
|
||||
@@ -124,7 +233,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
||||
}
|
||||
);
|
||||
const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
|
||||
console.log("*** ~ generateBillFormData ~ joblineMatches:", joblineMatches);
|
||||
console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
|
||||
|
||||
const vendorFuse = new Fuse(
|
||||
jobData.vendors,
|
||||
@@ -137,7 +246,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
||||
);
|
||||
|
||||
const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
|
||||
console.log("*** ~ generateBillFormData ~ vendorMatches:", vendorMatches);
|
||||
console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
|
||||
let vendorid;
|
||||
if (vendorMatches.length > 0) {
|
||||
vendorid = vendorMatches[0].item.id;
|
||||
@@ -162,6 +271,11 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
||||
const { matches, textractLineItem, } = jlMatchLine
|
||||
//Matches should be prioritized, take the first one.
|
||||
const matchToUse = matches.length > 0 ? matches[0] : null;
|
||||
|
||||
// Calculate confidence scores (0-100%)
|
||||
const ocrConfidence = calculateTextractConfidence(textractLineItem);
|
||||
const matchConfidence = calculateMatchConfidence(matches, matchToUse);
|
||||
const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence);
|
||||
//TODO: Should be using the textract if there is an exact match on the normalized label.
|
||||
//if there isn't then we can do the below.
|
||||
|
||||
@@ -224,6 +338,8 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
||||
"local": false
|
||||
},
|
||||
"joblineid": matchToUse?.item?.id || "noline",
|
||||
"confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}`
|
||||
|
||||
}
|
||||
return lineObject
|
||||
})
|
||||
@@ -294,11 +410,14 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
|
||||
const dashSearch = fuseToSearch.search(withDashes);
|
||||
lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });
|
||||
|
||||
// 4: Search letters only (remove numbers and special chars)
|
||||
// 4: Special chars to spaces (preserve word boundaries)
|
||||
const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase();
|
||||
const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces);
|
||||
lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length });
|
||||
|
||||
return mergeResults(
|
||||
[normalizedSearch, minimalSearch, dashSearch],
|
||||
[1.0, 1.1, 1.2] // Prefer fully normalized, letters-only weighted less
|
||||
[normalizedSearch, minimalSearch, dashSearch, specialCharsSearch],
|
||||
[1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted
|
||||
);
|
||||
})();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user