const MIN_CONFIDENCE_VALUE = 50 function normalizeFieldName(fieldType) { //Placeholder normalization for now. return fieldType; } const standardizedFieldsnames = { actual_cost: "actual_cost", actual_price: "actual_price", line_desc: "line_desc", quantity: "quantity", part_no: "part_no", ro_number: "ro_number", } function normalizeLabelName(labelText) { if (!labelText) return ''; // Convert to lowercase and trim whitespace let normalized = labelText.toLowerCase().trim(); // Remove special characters and replace spaces with underscores normalized = normalized.replace(/[^a-z0-9\s]/g, '').replace(/\s+/g, '_'); // Common label normalizations const labelMap = { 'qty': standardizedFieldsnames.quantity, 'qnty': standardizedFieldsnames.quantity, 'sale_qty': standardizedFieldsnames.quantity, 'invoiced_qty': standardizedFieldsnames.quantity, 'qty_shipped': standardizedFieldsnames.quantity, 'quantity': standardizedFieldsnames.quantity, 'filled': standardizedFieldsnames.quantity, 'count': standardizedFieldsnames.quantity, 'quant': standardizedFieldsnames.quantity, 'desc': standardizedFieldsnames.line_desc, 'description': standardizedFieldsnames.line_desc, 'item': standardizedFieldsnames.line_desc, 'part': standardizedFieldsnames.part_no, 'part_no': standardizedFieldsnames.part_no, 'part_num': standardizedFieldsnames.part_no, 'part_number': standardizedFieldsnames.part_no, 'item_no': standardizedFieldsnames.part_no, 'price': standardizedFieldsnames.actual_price, //'amount': standardizedFieldsnames.actual_price, 'list_price': standardizedFieldsnames.actual_price, 'unit_price': standardizedFieldsnames.actual_price, 'list': standardizedFieldsnames.actual_price, 'retail_price': standardizedFieldsnames.actual_price, 'retail': standardizedFieldsnames.actual_price, 'net': standardizedFieldsnames.actual_cost, 'selling_price': standardizedFieldsnames.actual_cost, 'net_price': standardizedFieldsnames.actual_cost, 'net_cost': standardizedFieldsnames.actual_cost, 'total': standardizedFieldsnames.actual_cost, 'po_no': standardizedFieldsnames.ro_number, 'customer_po_no': standardizedFieldsnames.ro_number, 'customer_po_no_': standardizedFieldsnames.ro_number }; return labelMap[normalized] || `NOT_MAPPED => ${normalized}`; // TODO: Should we monitor unmapped labels? } function processScanData(invoiceData) { // Process and clean the extracted data const processed = { summary: {}, lineItems: [] }; // Clean summary fields for (const [key, value] of Object.entries(invoiceData.summary)) { if (value.confidence > MIN_CONFIDENCE_VALUE) { // Only include fields with > 50% confidence processed.summary[key] = { value: value.value, label: value.label, normalizedLabel: value.normalizedLabel, confidence: value.confidence }; } } // Process line items processed.lineItems = invoiceData.lineItems .map(item => { const processedItem = {}; for (const [key, value] of Object.entries(item)) { if (value.confidence > MIN_CONFIDENCE_VALUE) { // Only include fields with > 50% confidence let cleanValue = value.value; // Parse numbers for quantity and price fields if (key === 'quantity') { cleanValue = parseFloat(cleanValue) || 0; } else if (key === 'retail_price' || key === 'actual_price') { // Remove currency symbols and parse cleanValue = parseFloat(cleanValue.replace(/[^0-9.-]/g, '')) || 0; } processedItem[key] = { value: cleanValue, label: value.label, normalizedLabel: value.normalizedLabel, confidence: value.confidence }; } } return processedItem; }) return processed; } function extractInvoiceData(textractResponse) { const invoiceData = { summary: {}, lineItems: [] }; if (!textractResponse.ExpenseDocuments || textractResponse.ExpenseDocuments.length === 0) { return invoiceData; } // Process each page of the invoice textractResponse.ExpenseDocuments.forEach(expenseDoc => { // Extract summary fields (vendor, invoice number, date, total, etc.) if (expenseDoc.SummaryFields) { expenseDoc.SummaryFields.forEach(field => { const fieldType = field.Type?.Text || ''; const fieldValue = field.ValueDetection?.Text || ''; const fieldLabel = field.LabelDetection?.Text || ''; const confidence = field.ValueDetection?.Confidence || 0; // Map common invoice fields if (fieldType && fieldValue) { invoiceData.summary[fieldType] = { value: fieldValue, label: fieldLabel, normalizedLabel: normalizeLabelName(fieldLabel), confidence: confidence }; } }); } // Extract line items if (expenseDoc.LineItemGroups) { expenseDoc.LineItemGroups.forEach(lineItemGroup => { if (lineItemGroup.LineItems) { lineItemGroup.LineItems.forEach(lineItem => { const item = {}; const fieldNameCounts = {}; // Track field name occurrences if (lineItem.LineItemExpenseFields) { lineItem.LineItemExpenseFields.forEach(field => { const fieldType = field.Type?.Text || ''; const fieldValue = field.ValueDetection?.Text || ''; const fieldLabel = field.LabelDetection?.Text || ''; const confidence = field.ValueDetection?.Confidence || 0; if (fieldType && fieldValue) { // Normalize field names let normalizedField = normalizeFieldName(fieldType); // Ensure uniqueness by appending a counter if the field already exists if (Object.prototype.hasOwnProperty.call(item, normalizedField)) { fieldNameCounts[normalizedField] = (fieldNameCounts[normalizedField] || 1) + 1; normalizedField = `${normalizedField}_${fieldNameCounts[normalizedField]}`; } item[normalizedField] = { value: fieldValue, label: fieldLabel, normalizedLabel: normalizeLabelName(fieldLabel), confidence: confidence }; } }); } if (Object.keys(item).length > 0) { invoiceData.lineItems.push(item); } }); } }); } }); return invoiceData; } module.exports = { extractInvoiceData, processScanData, standardizedFieldsnames }