Files
bodyshop/server/ai/bill-ocr/bill-ocr-normalize.js

184 lines
6.8 KiB
JavaScript

/**
 * Confidence cutoff applied to extracted fields: a field is kept only when its
 * confidence is strictly greater than this value (i.e. above 50%).
 */
const MIN_CONFIDENCE_VALUE = 50;
/**
 * Maps a raw field type onto the name used as the key for that field.
 *
 * Placeholder normalization for now: the input is handed back unchanged.
 *
 * @param {*} fieldType - Raw field type.
 * @returns {*} The same value that was passed in.
 */
function normalizeFieldName(fieldType) {
  const normalizedFieldType = fieldType;
  return normalizedFieldType;
}
/**
 * Canonical field names that raw invoice column labels are mapped onto.
 */
const STANDARDIZED_FIELD_NAMES = Object.freeze({
  actual_cost: 'actual_cost',
  actual_price: 'actual_price',
  line_desc: 'line_desc',
  quantity: 'quantity',
  part_no: 'part_no',
});

/**
 * Lookup from a cleaned label (lowercase, underscores for spaces) to its
 * canonical field name. A Map is used instead of a plain object so that labels
 * which happen to match inherited Object properties (e.g. "constructor") are
 * not resolved to prototype members.
 */
const LABEL_NORMALIZATION_MAP = new Map([
  ['qty', STANDARDIZED_FIELD_NAMES.quantity],
  ['qnty', STANDARDIZED_FIELD_NAMES.quantity],
  ['sale_qty', STANDARDIZED_FIELD_NAMES.quantity],
  ['invoiced_qty', STANDARDIZED_FIELD_NAMES.quantity],
  ['qty_shipped', STANDARDIZED_FIELD_NAMES.quantity],
  ['quant', STANDARDIZED_FIELD_NAMES.quantity],
  ['desc', STANDARDIZED_FIELD_NAMES.line_desc],
  ['description', STANDARDIZED_FIELD_NAMES.line_desc],
  ['item', STANDARDIZED_FIELD_NAMES.line_desc],
  ['part', STANDARDIZED_FIELD_NAMES.part_no],
  ['part_no', STANDARDIZED_FIELD_NAMES.part_no],
  ['part_num', STANDARDIZED_FIELD_NAMES.part_no],
  ['part_number', STANDARDIZED_FIELD_NAMES.part_no],
  ['price', STANDARDIZED_FIELD_NAMES.actual_price],
  ['unit_price', STANDARDIZED_FIELD_NAMES.actual_price],
  ['amount', STANDARDIZED_FIELD_NAMES.actual_price],
  ['list_price', STANDARDIZED_FIELD_NAMES.actual_price],
  ['list', STANDARDIZED_FIELD_NAMES.actual_price],
  ['retail_price', STANDARDIZED_FIELD_NAMES.actual_price],
  ['net', STANDARDIZED_FIELD_NAMES.actual_cost],
  ['selling_price', STANDARDIZED_FIELD_NAMES.actual_cost],
]);

/**
 * Converts a raw label into a canonical field name.
 *
 * The label is lowercased, stripped of every character other than a-z, 0-9
 * and whitespace, trimmed, and its whitespace runs are collapsed to single
 * underscores before being looked up in the label map.
 *
 * @param {string} labelText - Raw label text; may be empty or missing.
 * @returns {string} The canonical field name, `UNKNOWN_<cleaned label>` when
 *   the label is not in the map, or '' when no label was supplied.
 */
function normalizeLabelName(labelText) {
  if (!labelText) return '';

  const normalized = labelText
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, '')
    // Trim only after stripping: removing a trailing symbol ("Part #") can
    // expose whitespace that would otherwise become a stray trailing "_".
    .trim()
    .replace(/\s+/g, '_');

  // TODO: Should we monitor unmapped labels?
  return LABEL_NORMALIZATION_MAP.get(normalized) ?? `UNKNOWN_${normalized}`;
}
/**
 * Cleans extracted invoice data: drops low-confidence fields and parses
 * numeric line-item values.
 *
 * A field is kept only when its `confidence` is strictly greater than
 * MIN_CONFIDENCE_VALUE. Summary values are passed through untouched. For line
 * items, the `quantity` key is parsed with parseFloat, and the `retail_price`
 * and `actual_price` keys are parsed after removing every character other
 * than digits, '.' and '-'; values that do not parse become 0.
 *
 * Missing `summary` / `lineItems` collections and null field entries are
 * treated as empty rather than throwing.
 *
 * @param {{summary?: Object, lineItems?: Object[]}} invoiceData - Extracted
 *   data whose fields are `{ value, label, normalizedLabel, confidence }`.
 * @returns {{summary: Object, lineItems: Object[]}} Filtered and cleaned copy.
 */
function processScanData(invoiceData) {
  const { summary: rawSummary, lineItems: rawLineItems } = invoiceData ?? {};

  // A field qualifies only with strictly more than the minimum confidence.
  const isConfident = (field) =>
    field != null && field.confidence > MIN_CONFIDENCE_VALUE;

  // Copies the known field properties, substituting the (possibly cleaned) value.
  const toProcessedField = (field, value) => ({
    value,
    label: field.label,
    normalizedLabel: field.normalizedLabel,
    confidence: field.confidence,
  });

  // NOTE(review): these lowercase key names only match if the upstream
  // extraction emits them as field keys - confirm against the caller.
  const cleanLineItemValue = (key, rawValue) => {
    if (key === 'quantity') {
      return parseFloat(rawValue) || 0;
    }
    if (key === 'retail_price' || key === 'actual_price') {
      // Coerce to string first so already-numeric values do not throw, then
      // remove currency symbols and thousands separators before parsing.
      return parseFloat(String(rawValue ?? '').replace(/[^0-9.-]/g, '')) || 0;
    }
    return rawValue;
  };

  const summary = {};
  for (const [key, field] of Object.entries(rawSummary ?? {})) {
    if (isConfident(field)) {
      summary[key] = toProcessedField(field, field.value);
    }
  }

  // NOTE: a filter that dropped items without a description or with a
  // quantity <= 0 used to follow this map but was disabled; every item is
  // currently returned, including ones left with no fields.
  const lineItems = (rawLineItems ?? []).map((item) => {
    const processedItem = {};
    for (const [key, field] of Object.entries(item ?? {})) {
      if (isConfident(field)) {
        processedItem[key] = toProcessedField(
          field,
          cleanLineItemValue(key, field.value),
        );
      }
    }
    return processedItem;
  });

  return { summary, lineItems };
}
/**
 * Flattens a Textract expense response into summary fields and line items.
 *
 * Every entry of `ExpenseDocuments` is visited in order. Summary fields are
 * keyed by their type text; line-item fields are keyed by the result of
 * normalizeFieldName. Each stored field has the shape
 * `{ value, label, normalizedLabel, confidence }`. Fields lacking a type or a
 * value are skipped, and line items that end up with no fields are omitted.
 *
 * NOTE(review): a summary type that occurs more than once (within a document
 * or across documents) is overwritten by the later occurrence - confirm this
 * is intended.
 *
 * @param {Object} textractResponse - Response object carrying `ExpenseDocuments`.
 * @returns {{summary: Object, lineItems: Object[]}} Extracted data; both
 *   collections are empty when there are no expense documents.
 */
function extractInvoiceData(textractResponse) {
  const summary = {};
  const lineItems = [];
  const invoiceData = { summary, lineItems };

  const documents = textractResponse.ExpenseDocuments;
  if (!documents || documents.length === 0) {
    return invoiceData;
  }

  // Reads one expense field; yields null when it has no type or no value.
  const readField = (field) => {
    const type = field.Type?.Text || '';
    const text = field.ValueDetection?.Text || '';
    if (!type || !text) {
      return null;
    }
    const label = field.LabelDetection?.Text || '';
    return {
      type,
      entry: {
        value: text,
        label,
        normalizedLabel: normalizeLabelName(label),
        confidence: field.ValueDetection?.Confidence || 0,
      },
    };
  };

  for (const expenseDoc of documents) {
    // Summary fields (vendor, invoice number, date, total, etc.)
    for (const summaryField of expenseDoc.SummaryFields || []) {
      const parsed = readField(summaryField);
      if (parsed) {
        summary[parsed.type] = parsed.entry;
      }
    }

    // Line items, grouped by line-item group
    for (const group of expenseDoc.LineItemGroups || []) {
      for (const lineItem of group.LineItems || []) {
        const item = {};
        for (const expenseField of lineItem.LineItemExpenseFields || []) {
          const parsed = readField(expenseField);
          if (parsed) {
            item[normalizeFieldName(parsed.type)] = parsed.entry;
          }
        }
        if (Object.keys(item).length > 0) {
          lineItems.push(item);
        }
      }
    }
  }

  return invoiceData;
}
// Public API of this module (CommonJS): the extraction step and the
// cleaning step. The normalization helpers above are not exported.
module.exports = {
extractInvoiceData,
processScanData
}