IO-3515 add confidence scoring
This commit is contained in:
@@ -19,6 +19,103 @@ const normalizePrice = (str) => {
|
|||||||
return str.replace(/[^0-9.-]+/g, "");
|
return str.replace(/[^0-9.-]+/g, "");
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Calculates the Textract OCR confidence for a single line item as a weighted
 * average of the per-field confidences.
 *
 * Every value of `textractLineItem` is expected to be an object carrying a
 * numeric `confidence`. Entries that are null / not objects, or whose
 * confidence is missing, non-numeric, NaN or 0, are ignored.
 *
 * Weights: ITEM and PRODUCT_CODE count 2x, PRICE / UNIT_PRICE / QUANTITY count
 * 1.5x, every other field counts 1x.
 *
 * @param {Object} textractLineItem - Map of field name to field object.
 * @returns {number} Weighted average rounded to 2 decimal places, or 0 when
 *   the line item is empty or has no usable confidence values.
 */
const calculateTextractConfidence = (textractLineItem) => {
  if (!textractLineItem || typeof textractLineItem !== "object") {
    return 0;
  }

  // Returns the weight for a field name; unknown fields count once.
  const weightFor = (key) => {
    switch (key) {
      case "ITEM":
      case "PRODUCT_CODE":
        return 2; // Description and part number are most important
      case "PRICE":
      case "UNIT_PRICE":
      case "QUANTITY":
        return 1.5; // Price and quantity moderately important
      default:
        return 1;
    }
  };

  let totalWeight = 0;
  let weightedSum = 0;

  for (const [key, field] of Object.entries(textractLineItem)) {
    // Optional chaining keeps a null/undefined field from throwing.
    const confidence = field?.confidence;
    // Same filter as before: only non-zero, non-NaN numbers contribute.
    if (typeof confidence !== "number" || !confidence) {
      continue;
    }

    const weight = weightFor(key);
    weightedSum += confidence * weight;
    totalWeight += weight;
  }

  if (totalWeight === 0) {
    return 0;
  }

  // Round to 2 decimal places.
  return Math.round((weightedSum / totalWeight) * 100) / 100;
};
|
||||||
|
|
||||||
|
/**
 * Calculates how confident we are in the chosen jobline match, on a 0-100 scale.
 *
 * The score is the sum of:
 *  - a base score: `bestMatch.finalScore * 10`, capped at 100;
 *  - +5 per entry in `bestMatch.fieldMatches`, capped at +15;
 *  - +10 when `bestMatch.hasPriceData` is truthy;
 *  - a margin bonus: 5x the gap between the best score and `matches[1]`'s
 *    score, clamped to 0..10, or a flat +5 when there is only one match.
 * The total is rounded to 2 decimals, capped at 100 and floored at 1, so any
 * existing match reports at least 1%.
 *
 * Missing or non-numeric scores are treated as 0 and a missing `fieldMatches`
 * as empty, so the function never throws on partial match objects and never
 * returns NaN.
 *
 * @param {Array<Object>} matches - Candidate matches; `matches[1]` is used as
 *   the runner-up (assumes the list is ordered best-first - confirm with caller).
 * @param {Object} bestMatch - The match selected for use.
 * @returns {number} Confidence between 1 and 100, or 0 when there is no match.
 */
const calculateMatchConfidence = (matches, bestMatch) => {
  if (!matches || matches.length === 0 || !bestMatch) {
    return 0; // No match = 0% confidence
  }

  // Guard against a missing/NaN score so the result can never become NaN.
  const bestScore = Number.isFinite(bestMatch.finalScore) ? bestMatch.finalScore : 0;

  // finalScore is "higher is better"; scale by 10 and cap at 100.
  const baseScore = Math.min(bestScore * 10, 100);

  // Bonus for multiple field matches (up to +15%).
  const fieldMatchCount = bestMatch.fieldMatches?.length ?? 0;
  const fieldMatchBonus = Math.min(fieldMatchCount * 5, 15);

  // Bonus for having price data (+10%).
  const priceDataBonus = bestMatch.hasPriceData ? 10 : 0;

  // Bonus for a clear winner; a lone match gets a small flat bonus instead.
  let confidenceMarginBonus = 5;
  if (matches.length > 1) {
    const runnerUpScore = matches[1]?.finalScore;
    const scoreDiff = Number.isFinite(runnerUpScore) ? bestScore - runnerUpScore : 0;
    // Clamp to 0..10: a bonus must never turn into a penalty.
    confidenceMarginBonus = Math.min(Math.max(scoreDiff * 5, 0), 10);
  }

  const total = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus;

  // Cap at 100% and round to 2 decimal places.
  const capped = Math.min(Math.round(total * 100) / 100, 100);

  // Ensure a minimum of 1% if there is any match at all.
  return Math.max(capped, 1);
};
|
||||||
|
|
||||||
|
/**
 * Combines OCR confidence and match confidence into one overall score.
 *
 * The result is a weighted average: 40% OCR confidence, 60% match confidence.
 * Match confidence weighs more because even perfect OCR is useless without a
 * good match. When there is no match (match confidence is 0 or not a finite
 * number) the overall confidence is 0 regardless of OCR quality. A non-finite
 * OCR confidence is treated as 0 so the result is never NaN.
 *
 * @param {number} ocrConfidence - OCR confidence for the line item.
 * @param {number} matchConfidence - Match confidence for the chosen jobline.
 * @returns {number} Overall confidence rounded to 2 decimal places.
 */
const calculateOverallConfidence = (ocrConfidence, matchConfidence) => {
  // No (usable) match: OCR confidence doesn't matter.
  if (!Number.isFinite(matchConfidence) || matchConfidence === 0) {
    return 0;
  }

  const ocr = Number.isFinite(ocrConfidence) ? ocrConfidence : 0;
  const overall = ocr * 0.4 + matchConfidence * 0.6;

  return Math.round(overall * 100) / 100;
};
|
||||||
|
|
||||||
// Helper function to merge and deduplicate results with weighted scoring
|
// Helper function to merge and deduplicate results with weighted scoring
|
||||||
const mergeResults = (resultsArray, weights = []) => {
|
const mergeResults = (resultsArray, weights = []) => {
|
||||||
const scoreMap = new Map();
|
const scoreMap = new Map();
|
||||||
@@ -102,21 +199,33 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
|||||||
|
|
||||||
//Create fuses of line descriptions for matching.
|
//Create fuses of line descriptions for matching.
|
||||||
const jobLineDescFuse = new Fuse(
|
const jobLineDescFuse = new Fuse(
|
||||||
jobData.jobs_by_pk.joblines,
|
jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })),
|
||||||
{
|
{
|
||||||
keys: [{
|
keys: [{
|
||||||
name: 'line_desc',
|
name: 'line_desc',
|
||||||
weight: 4
|
weight: 6
|
||||||
}, {
|
}, {
|
||||||
name: 'oem_partno',
|
name: 'oem_partno',
|
||||||
weight: 5
|
weight: 8
|
||||||
}, {
|
}, {
|
||||||
name: 'alt_partno',
|
name: 'alt_partno',
|
||||||
weight: 3
|
weight: 5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: 'act_price',
|
name: 'act_price',
|
||||||
weight: 1
|
weight: 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'line_desc_normalized',
|
||||||
|
weight: 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'oem_partno_normalized',
|
||||||
|
weight: 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'alt_partno_normalized',
|
||||||
|
weight: 3
|
||||||
}],
|
}],
|
||||||
threshold: 0.4, //Adjust as needed for matching sensitivity,
|
threshold: 0.4, //Adjust as needed for matching sensitivity,
|
||||||
includeScore: true,
|
includeScore: true,
|
||||||
@@ -124,7 +233,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
|
const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData });
|
||||||
console.log("*** ~ generateBillFormData ~ joblineMatches:", joblineMatches);
|
console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2));
|
||||||
|
|
||||||
const vendorFuse = new Fuse(
|
const vendorFuse = new Fuse(
|
||||||
jobData.vendors,
|
jobData.vendors,
|
||||||
@@ -137,7 +246,7 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
|||||||
);
|
);
|
||||||
|
|
||||||
const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
|
const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value);
|
||||||
console.log("*** ~ generateBillFormData ~ vendorMatches:", vendorMatches);
|
console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2));
|
||||||
let vendorid;
|
let vendorid;
|
||||||
if (vendorMatches.length > 0) {
|
if (vendorMatches.length > 0) {
|
||||||
vendorid = vendorMatches[0].item.id;
|
vendorid = vendorMatches[0].item.id;
|
||||||
@@ -162,6 +271,11 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
|||||||
const { matches, textractLineItem, } = jlMatchLine
|
const { matches, textractLineItem, } = jlMatchLine
|
||||||
//Matches should be prioritized, take the first one.
|
//Matches should be prioritized, take the first one.
|
||||||
const matchToUse = matches.length > 0 ? matches[0] : null;
|
const matchToUse = matches.length > 0 ? matches[0] : null;
|
||||||
|
|
||||||
|
// Calculate confidence scores (0-100%)
|
||||||
|
const ocrConfidence = calculateTextractConfidence(textractLineItem);
|
||||||
|
const matchConfidence = calculateMatchConfidence(matches, matchToUse);
|
||||||
|
const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence);
|
||||||
//TODO: Should be using the textract if there is an exact match on the normalized label.
|
//TODO: Should be using the textract if there is an exact match on the normalized label.
|
||||||
//if there isn't then we can do the below.
|
//if there isn't then we can do the below.
|
||||||
|
|
||||||
@@ -224,6 +338,8 @@ async function generateBillFormData({ processedData, jobid, bodyshopid, partsord
|
|||||||
"local": false
|
"local": false
|
||||||
},
|
},
|
||||||
"joblineid": matchToUse?.item?.id || "noline",
|
"joblineid": matchToUse?.item?.id || "noline",
|
||||||
|
"confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}`
|
||||||
|
|
||||||
}
|
}
|
||||||
return lineObject
|
return lineObject
|
||||||
})
|
})
|
||||||
@@ -294,11 +410,14 @@ function joblineFuzzySearch({ fuseToSearch, processedData }) {
|
|||||||
const dashSearch = fuseToSearch.search(withDashes);
|
const dashSearch = fuseToSearch.search(withDashes);
|
||||||
lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });
|
lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length });
|
||||||
|
|
||||||
// 4: Search letters only (remove numbers and special chars)
|
// 4: Special chars to spaces (preserve word boundaries)
|
||||||
|
const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase();
|
||||||
|
const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces);
|
||||||
|
lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length });
|
||||||
|
|
||||||
return mergeResults(
|
return mergeResults(
|
||||||
[normalizedSearch, minimalSearch, dashSearch],
|
[normalizedSearch, minimalSearch, dashSearch, specialCharsSearch],
|
||||||
[1.0, 1.1, 1.2] // Prefer fully normalized, letters-only weighted less
|
[1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted
|
||||||
);
|
);
|
||||||
})();
|
})();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user