const Fuse = require('fuse.js'); const { has } = require("lodash"); const PRICE_PERCENT_MARGIN_TOLERANCE = 0.5; //Used to make sure prices and costs are likely. // Helper function to normalize fields const normalizePartNumber = (str) => { return str.replace(/[^a-zA-Z0-9]/g, '').toUpperCase(); }; const normalizeText = (str) => { return str.replace(/[^a-zA-Z0-9\s]/g, '').replace(/\s+/g, ' ').trim().toUpperCase(); }; const normalizePrice = (str) => { if (typeof str !== 'string') return str; return str.replace(/[^0-9.-]+/g, ""); }; // Helper function to calculate Textract OCR confidence (0-100%) const calculateTextractConfidence = (textractLineItem) => { if (!textractLineItem || Object.keys(textractLineItem).length === 0) { return 0; } const confidenceValues = []; // Collect confidence from all fields in the line item Object.values(textractLineItem).forEach(field => { if (field.confidence && typeof field.confidence === 'number') { confidenceValues.push(field.confidence); } }); if (confidenceValues.length === 0) { return 0; } // Calculate weighted average, giving more weight to important fields // If we can identify key fields (ITEM, PRODUCT_CODE, PRICE), weight them higher let totalWeight = 0; let weightedSum = 0; Object.entries(textractLineItem).forEach(([key, field]) => { if (field.confidence && typeof field.confidence === 'number') { // Weight important fields higher let weight = 1; if (key === 'ITEM' || key === 'PRODUCT_CODE') { weight = 2; // Description and part number are most important } else if (key === 'PRICE' || key === 'UNIT_PRICE' || key === 'QUANTITY') { weight = 1.5; // Price and quantity moderately important } weightedSum += field.confidence * weight; totalWeight += weight; } }); const avgConfidence = totalWeight > 0 ? weightedSum / totalWeight : 0; return Math.round(avgConfidence * 100) / 100; // Round to 2 decimal places }; // Helper function to calculate match confidence score (0-100%) const calculateMatchConfidence = (matches, bestMatch) => { if (!matches || matches.length === 0 || !bestMatch) { return 0; // No match = 0% confidence } // Base confidence from the match score // finalScore is already weighted and higher is better // Normalize it to a 0-100 scale const baseScore = Math.min(bestMatch.finalScore * 10, 100); // Scale factor of 10, cap at 100 // Bonus for multiple field matches (up to +15%) const fieldMatchBonus = Math.min(bestMatch.fieldMatches.length * 5, 15); // Bonus for having price data (+10%) const priceDataBonus = bestMatch.hasPriceData ? 10 : 0; // Bonus for clear winner (gap between 1st and 2nd match) let confidenceMarginBonus = 0; if (matches.length > 1) { const scoreDiff = bestMatch.finalScore - matches[1].finalScore; // If the best match is significantly better than the second best, add bonus confidenceMarginBonus = Math.min(scoreDiff * 5, 10); // Up to +10% } else { // Only one match found, add small bonus confidenceMarginBonus = 5; } // Calculate total match confidence let matchConfidence = baseScore + fieldMatchBonus + priceDataBonus + confidenceMarginBonus; // Cap at 100% and round to 2 decimal places matchConfidence = Math.min(Math.round(matchConfidence * 100) / 100, 100); // Ensure minimum of 1% if there's any match at all return Math.max(matchConfidence, 1); }; // Helper function to calculate overall confidence combining OCR and match confidence const calculateOverallConfidence = (ocrConfidence, matchConfidence) => { // If there's no match, OCR confidence doesn't matter much if (matchConfidence === 0) { return 0; } // Overall confidence is affected by both how well Textract read the data // and how well we matched it to existing joblines // Use a weighted average: 40% OCR confidence, 60% match confidence // Match confidence is more important because even perfect OCR is useless without a good match const overall = (ocrConfidence * 0.4) + (matchConfidence * 0.6); return Math.round(overall * 100) / 100; }; // Helper function to merge and deduplicate results with weighted scoring const mergeResults = (resultsArray, weights = []) => { const scoreMap = new Map(); resultsArray.forEach((results, index) => { const weight = weights[index] || 1; results.forEach(result => { const id = result.item.id; const weightedScore = result.score * weight; if (!scoreMap.has(id)) { scoreMap.set(id, { item: result.item, score: weightedScore, count: 1 }); } else { const existing = scoreMap.get(id); // Lower score is better in Fuse.js, so take the minimum existing.score = Math.min(existing.score, weightedScore); existing.count++; } }); }); // Convert back to array and sort by score (lower is better) return Array.from(scoreMap.values()) .sort((a, b) => { // Prioritize items found in multiple searches if (a.count !== b.count) return b.count - a.count; return a.score - b.score; }) .slice(0, 5); // Return top 5 results }; async function generateBillFormData({ processedData, jobid, bodyshopid, partsorderid, req }) { const client = req.userGraphQLClient; //TODO: Add in vendor data. const jobData = await client.request(` query QUERY_BILL_OCR_DATA($jobid: uuid!, $partsorderid: uuid!) { vendors{ id name } jobs_by_pk(id: $jobid) { id bodyshop{ id md_responsibility_centers cdk_dealerid pbs_serialnumber rr_dealerid } joblines { id line_desc removed act_price db_price oem_partno alt_partno part_type } } parts_orders_by_pk(id: $partsorderid) { id parts_order_lines { id line_desc act_price cost jobline { id line_desc act_price oem_partno alt_partno part_type } } } } `, { jobid, partsorderid // this may fail if null? }); //TODO: Need to find a vendor ID. Create a fuse for it, and fuzzy search for it using the textract vendor info. //Create fuses of line descriptions for matching. const jobLineDescFuse = new Fuse( jobData.jobs_by_pk.joblines.map(jl => ({ ...jl, line_desc_normalized: normalizeText(jl.line_desc || ""), oem_partno_normalized: normalizePartNumber(jl.oem_partno || ""), alt_partno_normalized: normalizePartNumber(jl.alt_partno || "") })), { keys: [{ name: 'line_desc', weight: 6 }, { name: 'oem_partno', weight: 8 }, { name: 'alt_partno', weight: 5 }, { name: 'act_price', weight: 1 }, { name: 'line_desc_normalized', weight: 4 }, { name: 'oem_partno_normalized', weight: 5 }, { name: 'alt_partno_normalized', weight: 3 }], threshold: 0.4, //Adjust as needed for matching sensitivity, includeScore: true, } ); const joblineMatches = joblineFuzzySearch({ fuseToSearch: jobLineDescFuse, processedData }); console.log("*** ~ generateBillFormData ~ joblineMatches:", JSON.stringify(joblineMatches, null, 2)); const vendorFuse = new Fuse( jobData.vendors, { keys: ['name'], threshold: 0.4, //Adjust as needed for matching sensitivity, includeScore: true, } ); const vendorMatches = vendorFuse.search(processedData.summary?.NAME?.value || processedData.summary?.VENDOR_NAME?.value); console.log("*** ~ generateBillFormData ~ vendorMatches:", JSON.stringify(vendorMatches, null, 2)); let vendorid; if (vendorMatches.length > 0) { vendorid = vendorMatches[0].item.id; } const { jobs_by_pk: job, parts_orders_by_pk: partsOrder } = jobData; if (!job) { throw new Error('Job not found for bill form data generation.'); } //Figure out which lines have a match and which don't. //TODO: How do we handle freight lines and core charges? //Create the form data structure for the bill posting screen. const billFormData = { "jobid": jobid, "vendorid": vendorid, "invoice_number": processedData.summary?.INVOICE_RECEIPT_ID?.value, "date": processedData.summary?.INVOICE_RECEIPT_DATE?.value, "is_credit_memo": false, "total": normalizePrice(processedData.summary?.INVOICE_TOTAL?.value || processedData.summary?.TOTAL?.value), "billlines": joblineMatches.map(jlMatchLine => { const { matches, textractLineItem, } = jlMatchLine //Matches should be prioritized, take the first one. const matchToUse = matches.length > 0 ? matches[0] : null; // Calculate confidence scores (0-100%) const ocrConfidence = calculateTextractConfidence(textractLineItem); const matchConfidence = calculateMatchConfidence(matches, matchToUse); const overallConfidence = calculateOverallConfidence(ocrConfidence, matchConfidence); //TODO: Should be using the textract if there is an exact match on the normalized label. //if there isn't then we can do the below. let actualPrice, actualCost; //TODO: What is several match on the normalized name? We need to pick the most likely one. const hasNormalizedActualPrice = Object.keys(textractLineItem).find(key => textractLineItem[key].normalizedLabel === 'actual_price'); const hasNormalizedActualCost = Object.keys(textractLineItem).find(key => textractLineItem[key].normalizedLabel === 'actual_cost'); if (hasNormalizedActualPrice) { actualPrice = textractLineItem[hasNormalizedActualPrice].value; } if (hasNormalizedActualCost) { actualCost = textractLineItem[hasNormalizedActualCost].value; } if (!hasNormalizedActualPrice || !hasNormalizedActualCost) { //This is if there was no match found for normalized labels. //Check all prices, and generally the higher one will be the actual price and the lower one will be the cost. //Need to make sure that other random items are excluded. This should be within a reasonable range of the matched jobline at matchToUse.item.act_price //Iterate over all of the text values, and check out which of them are currencies. //They'll be in the format starting with a $ sign usually. const currencyTextractLineItems = [] // {key, value} Object.keys(textractLineItem).forEach(key => { const currencyValue = textractLineItem[key].value?.startsWith('$') ? textractLineItem[key].value : null; if (currencyValue) { //Clean it and parse it const cleanValue = parseFloat(currencyValue.replace(/[^0-9.-]/g, '')) || 0; currencyTextractLineItems.push({ key, value: cleanValue }) } }) //Sort them descending currencyTextractLineItems.sort((a, b) => b.value - a.value); //Most expensive should be the actual price, second most expensive should be the cost. if (!actualPrice) actualPrice = currencyTextractLineItems.length > 0 ? currencyTextractLineItems[0].value : 0; if (!actualCost) actualCost = currencyTextractLineItems.length > 1 ? currencyTextractLineItems[1].value : 0; if (matchToUse) { //Double check that they're within 50% of the matched jobline price if there is one. const joblinePrice = parseFloat(matchToUse.item.act_price) || 0; if (!hasNormalizedActualPrice && actualPrice > 0 && (actualPrice < joblinePrice * (1 - PRICE_PERCENT_MARGIN_TOLERANCE) || actualPrice > joblinePrice * (1 + PRICE_PERCENT_MARGIN_TOLERANCE))) { actualPrice = joblinePrice; //Set to the jobline as a fallback. } if (!hasNormalizedActualCost && actualCost > 0 && (actualCost < joblinePrice * (1 - PRICE_PERCENT_MARGIN_TOLERANCE) || actualCost > joblinePrice * (1 + PRICE_PERCENT_MARGIN_TOLERANCE))) { actualCost = null //Blank it out if it's not likely. } } } const responsibilityCenters = job.bodyshop.md_responsibility_centers //TODO: Do we need to verify the lines to see if it is a unit price or total price (i.e. quantity * price) const lineObject = { "line_desc": matchToUse?.item?.line_desc || textractLineItem.ITEM?.value || "NO DESCRIPTION", "quantity": textractLineItem.QUANTITY?.value, // convert to integer? "actual_price": normalizePrice(actualPrice), "actual_cost": normalizePrice(actualCost), "cost_center": matchToUse?.item?.part_type ? bodyshopHasDmsKey(job.bodyshop) ? matchToUse?.item?.part_type !== "PAE" ? matchToUse?.item?.part_type : null : responsibilityCenters.defaults && (responsibilityCenters.defaults.costs[matchToUse?.item?.part_type] || null) : null, //Needs to get set by client side. "applicable_taxes": { //Not sure what to do with these? "federal": false, "state": false, "local": false }, "joblineid": matchToUse?.item?.id || "noline", "confidence": `T${overallConfidence} - O${ocrConfidence} - J${matchConfidence}` } return lineObject }) } return billFormData } function joblineFuzzySearch({ fuseToSearch, processedData }) { const matches = [] const searchStats = []; // Track search statistics processedData.lineItems.forEach((lineItem, lineIndex) => { const lineStats = { lineNumber: lineIndex + 1, searches: [] }; // Refined ITEM search (multi-word description) const refinedItemResults = (() => { if (!lineItem.ITEM?.value) return []; const itemValue = lineItem.ITEM.value; const normalized = normalizeText(itemValue); // 1: Full string search const fullSearch = fuseToSearch.search(normalized); lineStats.searches.push({ type: 'ITEM - Full String', term: normalized, results: fullSearch.length }); // 2: Search individual significant words (3+ chars) const words = normalized.split(' ').filter(w => w.length >= 3); const wordSearches = words.map(word => { const results = fuseToSearch.search(word); lineStats.searches.push({ type: 'ITEM - Individual Word', term: word, results: results.length }); return results; }); // 3: Search without spaces entirely const noSpaceSearch = fuseToSearch.search(normalized.replace(/\s+/g, '')); lineStats.searches.push({ type: 'ITEM - No Spaces', term: normalized.replace(/\s+/g, ''), results: noSpaceSearch.length }); // Merge results with weights (full search weighted higher) return mergeResults( [fullSearch, ...wordSearches, noSpaceSearch], [1.0, ...words.map(() => 1.5), 1.2] // Full search best, individual words penalized slightly ); })(); // Refined PRODUCT_CODE search (part numbers) const refinedProductCodeResults = (() => { if (!lineItem.PRODUCT_CODE?.value) return []; const productCode = lineItem.PRODUCT_CODE.value; const normalized = normalizePartNumber(productCode); // 1: Normalized search (no spaces/special chars) const normalizedSearch = fuseToSearch.search(normalized); lineStats.searches.push({ type: 'PRODUCT_CODE - Normalized', term: normalized, results: normalizedSearch.length }); // 2: Original with minimal cleaning const minimalClean = productCode.replace(/\s+/g, '').toUpperCase(); const minimalSearch = fuseToSearch.search(minimalClean); lineStats.searches.push({ type: 'PRODUCT_CODE - Minimal Clean', term: minimalClean, results: minimalSearch.length }); // 3: Search with dashes (common in part numbers) const withDashes = productCode.replace(/[^a-zA-Z0-9-]/g, '').toUpperCase(); const dashSearch = fuseToSearch.search(withDashes); lineStats.searches.push({ type: 'PRODUCT_CODE - With Dashes', term: withDashes, results: dashSearch.length }); // 4: Special chars to spaces (preserve word boundaries) const specialCharsToSpaces = productCode.replace(/[^a-zA-Z0-9\s]/g, ' ').replace(/\s+/g, ' ').trim().toUpperCase(); const specialCharsSearch = fuseToSearch.search(specialCharsToSpaces); lineStats.searches.push({ type: 'PRODUCT_CODE - Special Chars to Spaces', term: specialCharsToSpaces, results: specialCharsSearch.length }); return mergeResults( [normalizedSearch, minimalSearch, dashSearch, specialCharsSearch], [1.0, 1.1, 1.2, 1.15] // Prefer fully normalized, special chars to spaces slightly weighted ); })(); // Refined PRICE search const refinedPriceResults = (() => { if (!lineItem.PRICE?.value) return []; const price = normalizePrice(lineItem.PRICE.value); // 1: Exact price match const exactSearch = fuseToSearch.search(price); lineStats.searches.push({ type: 'PRICE - Exact', term: price, results: exactSearch.length }); // 2: Price with 2 decimal places const priceFloat = parseFloat(price); if (!isNaN(priceFloat)) { const formattedPrice = priceFloat.toFixed(2); const formattedSearch = fuseToSearch.search(formattedPrice); lineStats.searches.push({ type: 'PRICE - Formatted (2 decimals)', term: formattedPrice, results: formattedSearch.length }); return mergeResults([exactSearch, formattedSearch], [1.0, 1.1]); } return exactSearch; })(); // Refined UNIT_PRICE search const refinedUnitPriceResults = (() => { if (!lineItem.UNIT_PRICE?.value) return []; const unitPrice = normalizePrice(lineItem.UNIT_PRICE.value); // 1: Exact price match const exactSearch = fuseToSearch.search(unitPrice); lineStats.searches.push({ type: 'UNIT_PRICE - Exact', term: unitPrice, results: exactSearch.length }); // 2: Price with 2 decimal places const priceFloat = parseFloat(unitPrice); if (!isNaN(priceFloat)) { const formattedPrice = priceFloat.toFixed(2); const formattedSearch = fuseToSearch.search(formattedPrice); lineStats.searches.push({ type: 'UNIT_PRICE - Formatted (2 decimals)', term: formattedPrice, results: formattedSearch.length }); return mergeResults([exactSearch, formattedSearch], [1.0, 1.1]); } return exactSearch; })(); //Merge them all together and sort by the highest scores. const combinedScoreMap = new Map(); // Weight different field types differently const fieldWeights = { productCode: 5.0, // Most important - part numbers should match item: 3.0, // Second most important - description price: 1.0, // Less important - prices can vary unitPrice: 0.8 // Least important - similar to price }; [ { results: refinedProductCodeResults, weight: fieldWeights.productCode, field: 'productCode' }, { results: refinedItemResults, weight: fieldWeights.item, field: 'item' }, { results: refinedPriceResults, weight: fieldWeights.price, field: 'price' }, { results: refinedUnitPriceResults, weight: fieldWeights.unitPrice, field: 'unitPrice' } ].forEach(({ results, weight, field }) => { results.forEach((result, index) => { const id = result.item.id; // Position bonus (first result is better than fifth) const positionBonus = (5 - index) / 5; // Lower score is better in Fuse.js, so invert it and apply weights const normalizedScore = (1 - result.score) * weight * positionBonus; if (!combinedScoreMap.has(id)) { combinedScoreMap.set(id, { item: result.item, score: normalizedScore, fieldMatches: [field], matchCount: result.count || 1 }); } else { const existing = combinedScoreMap.get(id); existing.score += normalizedScore; existing.fieldMatches.push(field); existing.matchCount += (result.count || 1); } }); }); // Convert to array and sort by best combined score const finalMatches = Array.from(combinedScoreMap.values()) .map(entry => { // Apply penalty if item has no act_price or it's 0 const hasPriceData = entry.item.act_price && parseFloat(entry.item.act_price) > 0; const priceDataPenalty = hasPriceData ? 1.0 : 0.5; // 50% penalty if no price return { ...entry, // Boost score for items that matched in multiple fields, penalize for missing price finalScore: entry.score * (1 + (entry.fieldMatches.length * 0.2)) * priceDataPenalty, hasPriceData }; }) .sort((a, b) => b.finalScore - a.finalScore) .slice(0, 5); // Always push the textract line item, even if no matches found // This ensures all invoice lines are processed matches.push({ matches: finalMatches, textractLineItem: lineItem, hasMatch: finalMatches.length > 0 }); searchStats.push(lineStats); }) // Output search statistics table console.log('\n═══════════════════════════════════════════════════════════════════════'); console.log(' FUSE.JS SEARCH STATISTICS'); console.log('═══════════════════════════════════════════════════════════════════════\n'); searchStats.forEach(lineStat => { console.log(`📄 Line Item #${lineStat.lineNumber}:`); console.log('─'.repeat(75)); if (lineStat.searches.length > 0) { const tableData = lineStat.searches.map(search => ({ 'Search Type': search.type, 'Search Term': search.term.substring(0, 40) + (search.term.length > 40 ? '...' : ''), 'Results': search.results })); console.table(tableData); } else { console.log(' No searches performed for this line item.\n'); } }); // Summary statistics const totalSearches = searchStats.reduce((sum, stat) => sum + stat.searches.length, 0); const totalResults = searchStats.reduce((sum, stat) => sum + stat.searches.reduce((s, search) => s + search.results, 0), 0); const avgResultsPerSearch = totalSearches > 0 ? (totalResults / totalSearches).toFixed(2) : 0; console.log('═══════════════════════════════════════════════════════════════════════'); console.log(' SUMMARY'); console.log('═══════════════════════════════════════════════════════════════════════'); console.table({ 'Total Line Items': processedData.lineItems.length, 'Total Searches Performed': totalSearches, 'Total Results Found': totalResults, 'Average Results per Search': avgResultsPerSearch }); console.log('═══════════════════════════════════════════════════════════════════════\n'); return matches } const bodyshopHasDmsKey = (bodyshop) => bodyshop.cdk_dealerid || bodyshop.pbs_serialnumber || bodyshop.rr_dealerid; module.exports = { generateBillFormData }