IO-3515 Bill OCR refactor to split files and introduce generator.
This commit is contained in:
10
package-lock.json
generated
10
package-lock.json
generated
@@ -39,6 +39,7 @@
|
||||
"express": "^4.21.1",
|
||||
"fast-xml-parser": "^5.3.3",
|
||||
"firebase-admin": "^13.6.0",
|
||||
"fuse.js": "^7.1.0",
|
||||
"graphql": "^16.12.0",
|
||||
"graphql-request": "^6.1.0",
|
||||
"intuit-oauth": "^4.2.2",
|
||||
@@ -7103,6 +7104,15 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/fuse.js": {
|
||||
"version": "7.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.1.0.tgz",
|
||||
"integrity": "sha512-trLf4SzuuUxfusZADLINj+dE8clK1frKdmqiJNb1Es75fmI5oY6X2mxLVUciLLjxqw/xr72Dhy+lER6dGd02FQ==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/gaxios": {
|
||||
"version": "6.7.1",
|
||||
"resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.7.1.tgz",
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
"express": "^4.21.1",
|
||||
"fast-xml-parser": "^5.3.3",
|
||||
"firebase-admin": "^13.6.0",
|
||||
"fuse.js": "^7.1.0",
|
||||
"graphql": "^16.12.0",
|
||||
"graphql-request": "^6.1.0",
|
||||
"intuit-oauth": "^4.2.2",
|
||||
|
||||
3346
server/ai/bill-ocr/bill-ocr-generator.js
Normal file
3346
server/ai/bill-ocr/bill-ocr-generator.js
Normal file
File diff suppressed because it is too large
Load Diff
159
server/ai/bill-ocr/bill-ocr-helpers.js
Normal file
159
server/ai/bill-ocr/bill-ocr-helpers.js
Normal file
@@ -0,0 +1,159 @@
|
||||
const PDFDocument = require('pdf-lib').PDFDocument;
|
||||
|
||||
// Namespace prefix for all Textract job entries stored in Redis.
const TEXTRACT_REDIS_PREFIX = "textract:";
// Time-to-live for a stored Textract job, in seconds (1 hour).
const TEXTRACT_JOB_TTL = 3600;


/**
 * Build the Redis key under which a Textract job's data lives.
 *
 * NOTE(review): TEXTRACT_REDIS_PREFIX already ends with ':' and the template
 * adds another, so keys come out as "textract::<id>". This matches the
 * "textract::*" pattern used by hasActiveJobs, but confirm the double colon
 * is intentional.
 *
 * @param {string} textractJobId - Job ID returned by AWS Textract.
 * @returns {string} Namespaced Redis key for this job.
 */
function getTextractJobKey(textractJobId) {
  const key = `${TEXTRACT_REDIS_PREFIX}:${textractJobId}`;
  return key;
}
|
||||
|
||||
|
||||
/**
 * Persist Textract job data in Redis under the job's namespaced key, with a
 * TTL of TEXTRACT_JOB_TTL seconds.
 *
 * @param {Object} args
 * @param {Object} args.redisPubClient - Initialized Redis client.
 * @param {string} args.textractJobId - Textract job ID used to build the key.
 * @param {Object} args.jobData - Arbitrary job state; stored as JSON.
 * @throws {Error} If the Redis client has not been initialized.
 */
async function setTextractJob({ redisPubClient, textractJobId, jobData }) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }
  const jobKey = getTextractJobKey(textractJobId);
  const serialized = JSON.stringify(jobData);
  // Write the payload, then attach the expiry in a second command.
  await redisPubClient.set(jobKey, serialized);
  await redisPubClient.expire(jobKey, TEXTRACT_JOB_TTL);
}
|
||||
|
||||
/**
 * Fetch Textract job data from Redis and parse it back into an object.
 *
 * @param {Object} args
 * @param {Object} args.redisPubClient - Initialized Redis client.
 * @param {string} args.textractJobId - Textract job ID used to build the key.
 * @returns {Promise<Object|null>} Parsed job data, or null when the key is
 *   missing (or holds an empty value).
 * @throws {Error} If the Redis client has not been initialized.
 */
async function getTextractJob({ redisPubClient, textractJobId }) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }
  const raw = await redisPubClient.get(getTextractJobKey(textractJobId));
  if (!raw) {
    return null;
  }
  return JSON.parse(raw);
}
|
||||
|
||||
/**
 * Detect a file's broad type from its MIME type, falling back to magic-byte
 * sniffing of the buffer.
 *
 * @param {Object} file - Multer file object; reads `mimetype` and `buffer`.
 * @returns {string} 'pdf', 'image', or 'unknown'.
 */
function getFileType(file) {
  // Trust the declared MIME type first.
  const mimeType = file.mimetype?.toLowerCase();

  if (mimeType === 'application/pdf') {
    return 'pdf';
  }

  if (mimeType && mimeType.startsWith('image/')) {
    return 'image';
  }

  // Fallback: check the file signature (magic bytes).
  // Fixed: the guard was `buffer.length > 4`, which rejected a buffer of
  // exactly 4 bytes even though the signature checks below only read
  // indices 0-3. `>= 4` is the correct minimum.
  const buffer = file.buffer;
  if (!buffer || buffer.length < 4) {
    return 'unknown';
  }

  // PDF signature: 25 50 44 46 ("%PDF")
  if (buffer[0] === 0x25 && buffer[1] === 0x50 && buffer[2] === 0x44 && buffer[3] === 0x46) {
    return 'pdf';
  }

  // JPEG signature: FF D8 FF
  if (buffer[0] === 0xFF && buffer[1] === 0xD8 && buffer[2] === 0xFF) {
    return 'image';
  }

  // PNG signature: 89 50 4E 47
  if (buffer[0] === 0x89 && buffer[1] === 0x50 && buffer[2] === 0x4E && buffer[3] === 0x47) {
    return 'image';
  }

  // HEIC/HEIF: an ISO-BMFF "ftyp" box near the start of the file whose brand
  // is one of heic/heix/hevc/hevx/mif1.
  if (buffer.length > 12) {
    const ftypIndex = buffer.indexOf(Buffer.from('ftyp'));
    if (ftypIndex > 0 && ftypIndex < 12) {
      const brand = buffer.slice(ftypIndex + 4, ftypIndex + 8).toString('ascii');
      if (brand.startsWith('heic') || brand.startsWith('heix') ||
          brand.startsWith('hevc') || brand.startsWith('hevx') ||
          brand.startsWith('mif1')) {
        return 'image';
      }
    }
  }

  return 'unknown';
}
|
||||
|
||||
/**
 * Count the pages in a PDF by loading it with pdf-lib.
 *
 * @param {Buffer} pdfBuffer - Raw PDF bytes.
 * @returns {Promise<number>} Number of pages in the document.
 * @throws {Error} Wrapped error ("Failed to read PDF: ...") when the buffer
 *   cannot be parsed as a PDF.
 */
async function getPdfPageCount(pdfBuffer) {
  try {
    const doc = await PDFDocument.load(pdfBuffer);
    const pages = doc.getPageCount();
    return pages;
  } catch (error) {
    console.error('Error reading PDF page count:', error);
    throw new Error('Failed to read PDF: ' + error.message);
  }
}
|
||||
|
||||
/**
 * Report whether any stored Textract job is still IN_PROGRESS.
 *
 * Scans every key under the Textract prefix and inspects each payload's
 * `status`. Errors during the scan are logged and treated as "no active
 * jobs" (best-effort check used to gate SQS polling).
 *
 * @param {Object} args
 * @param {Object} args.redisPubClient - Initialized Redis client.
 * @returns {Promise<boolean>} True when at least one job is IN_PROGRESS.
 * @throws {Error} If the Redis client has not been initialized.
 */
async function hasActiveJobs({ redisPubClient }) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized.');
  }

  try {
    // All textract job keys under the shared prefix.
    const keys = await redisPubClient.keys(`${TEXTRACT_REDIS_PREFIX}:*`);
    if (!keys || keys.length === 0) {
      return false;
    }

    //TODO: Is there a better way to do this that supports clusters?
    // NOTE(review): KEYS blocks Redis and each GET is a round trip; a SCAN
    // iterator (or MGET) would scale better — confirm client API first.
    for (const key of keys) {
      const raw = await redisPubClient.get(key);
      if (!raw) {
        continue;
      }
      const jobData = JSON.parse(raw);
      if (jobData.status === 'IN_PROGRESS') {
        return true;
      }
    }

    return false;
  } catch (error) {
    console.error('Error checking for active jobs:', error);
    return false;
  }
}
|
||||
|
||||
// Public surface of the bill-ocr helper module; bill-ocr.js imports these
// (see its require of "./bill-ocr-helpers"). TEXTRACT_REDIS_PREFIX is
// exported so callers can build matching key patterns.
module.exports = {
getTextractJobKey,
setTextractJob,
getTextractJob,
getFileType,
getPdfPageCount,
hasActiveJobs,
TEXTRACT_REDIS_PREFIX
}
|
||||
|
||||
184
server/ai/bill-ocr/bill-ocr-normalize.js
Normal file
184
server/ai/bill-ocr/bill-ocr-normalize.js
Normal file
@@ -0,0 +1,184 @@
|
||||
|
||||
// Confidence floor (percent): fields at or below this are dropped downstream.
const MIN_CONFIDENCE_VALUE = 50;


/**
 * Normalize a Textract field type name.
 * Placeholder for now: the type is returned unchanged.
 *
 * @param {string} fieldType - Raw Textract field type (e.g. from Type.Text).
 * @returns {string} The same field type, unmodified.
 */
function normalizeFieldName(fieldType) {
  return fieldType;
}
|
||||
|
||||
|
||||
/**
 * Map a raw Textract label to a standardized field name.
 *
 * The label is lowercased, trimmed, stripped of punctuation, and snake_cased
 * before lookup. Unmapped labels are returned as `UNKNOWN_<normalized>` so
 * they can be spotted downstream. // TODO: Should we monitor unmapped labels?
 *
 * Fix: the lookup now uses Object.hasOwn() instead of a truthy property read,
 * so labels that normalize to Object.prototype member names (e.g.
 * "constructor", "toString") correctly yield UNKNOWN_* instead of returning
 * an inherited function from the prototype chain.
 *
 * @param {string} labelText - Raw label text from Textract (may be falsy).
 * @returns {string} Standardized name, '' for empty input, or UNKNOWN_*.
 */
function normalizeLabelName(labelText) {
  if (!labelText) return '';

  // Lowercase, trim, drop punctuation, snake_case.
  const normalized = labelText
    .toLowerCase()
    .trim()
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, '_');

  // Canonical output names used across the OCR pipeline.
  const standardizedFieldNames = {
    actual_cost: "actual_cost",
    actual_price: "actual_price",
    line_desc: "line_desc",
    quantity: "quantity",
    part_no: "part_no"
  };

  // Common label spellings -> canonical names.
  const labelMap = {
    'qty': standardizedFieldNames.quantity,
    'qnty': standardizedFieldNames.quantity,
    'sale_qty': standardizedFieldNames.quantity,
    'invoiced_qty': standardizedFieldNames.quantity,
    'qty_shipped': standardizedFieldNames.quantity,
    'quant': standardizedFieldNames.quantity,
    'desc': standardizedFieldNames.line_desc,
    'description': standardizedFieldNames.line_desc,
    'item': standardizedFieldNames.line_desc,
    'part': standardizedFieldNames.part_no,
    'part_no': standardizedFieldNames.part_no,
    'part_num': standardizedFieldNames.part_no,
    'part_number': standardizedFieldNames.part_no,
    'price': standardizedFieldNames.actual_price,
    'unit_price': standardizedFieldNames.actual_price,
    'amount': standardizedFieldNames.actual_price,
    'list_price': standardizedFieldNames.actual_price,
    'list': standardizedFieldNames.actual_price,
    'retail_price': standardizedFieldNames.actual_price,
    'net': standardizedFieldNames.actual_cost,
    'selling_price': standardizedFieldNames.actual_cost,
  };

  return Object.hasOwn(labelMap, normalized)
    ? labelMap[normalized]
    : `UNKNOWN_${normalized}`;
}
|
||||
|
||||
/**
 * Clean extracted invoice data: keep only fields whose confidence exceeds
 * MIN_CONFIDENCE_VALUE, and coerce quantity/price values on line items to
 * numbers.
 *
 * @param {{summary: Object, lineItems: Array<Object>}} invoiceData - Output
 *   of extractInvoiceData.
 * @returns {{summary: Object, lineItems: Array<Object>}} Cleaned copy.
 */
function processScanData(invoiceData) {
  const processed = {
    summary: {},
    lineItems: []
  };

  // Summary fields: drop anything at or below the confidence floor.
  // (Negated `>` rather than `<=` so undefined/NaN confidence is still
  // excluded, exactly as before.)
  for (const [key, value] of Object.entries(invoiceData.summary)) {
    if (!(value.confidence > MIN_CONFIDENCE_VALUE)) continue;
    processed.summary[key] = {
      value: value.value,
      label: value.label,
      normalizedLabel: value.normalizedLabel,
      confidence: value.confidence
    };
  }

  // Line items: same confidence filter, plus numeric parsing for
  // quantity/price fields.
  processed.lineItems = invoiceData.lineItems.map((item) => {
    const processedItem = {};

    for (const [key, value] of Object.entries(item)) {
      if (!(value.confidence > MIN_CONFIDENCE_VALUE)) continue;

      let cleanValue = value.value;
      // NOTE(review): keys come from normalizeFieldName, which is currently a
      // pass-through of Textract's Type.Text — confirm these lowercase key
      // comparisons actually match real payloads.
      if (key === 'quantity') {
        cleanValue = parseFloat(cleanValue) || 0;
      } else if (key === 'retail_price' || key === 'actual_price') {
        // Strip currency symbols/commas before parsing.
        cleanValue = parseFloat(cleanValue.replace(/[^0-9.-]/g, '')) || 0;
      }

      processedItem[key] = {
        value: cleanValue,
        label: value.label,
        normalizedLabel: value.normalizedLabel,
        confidence: value.confidence
      };
    }

    return processedItem;
  });
  // (A filter dropping empty/zero-quantity items was previously sketched
  // here and remains disabled.)

  return processed;
}
|
||||
|
||||
/**
 * Convert a raw Textract expense-analysis response into structured invoice
 * data: summary fields keyed by Textract type, and an array of line items.
 * Each captured field carries its value, raw label, normalized label, and
 * value confidence.
 *
 * @param {Object} textractResponse - Raw Textract response; reads
 *   ExpenseDocuments[].SummaryFields and ...LineItemGroups.
 * @returns {{summary: Object, lineItems: Array<Object>}}
 */
function extractInvoiceData(textractResponse) {
  const invoiceData = {
    summary: {},
    lineItems: []
  };

  const docs = textractResponse.ExpenseDocuments;
  if (!docs || docs.length === 0) {
    return invoiceData;
  }

  // Pull the pieces of a Textract expense field we care about.
  const readField = (field) => ({
    type: field.Type?.Text || '',
    value: field.ValueDetection?.Text || '',
    label: field.LabelDetection?.Text || '',
    confidence: field.ValueDetection?.Confidence || 0
  });

  // Each ExpenseDocument corresponds to one page of the invoice.
  for (const expenseDoc of docs) {
    // Summary fields: vendor, invoice number, date, total, etc.
    for (const field of expenseDoc.SummaryFields || []) {
      const { type, value, label, confidence } = readField(field);
      if (!type || !value) continue;
      invoiceData.summary[type] = {
        value,
        label,
        normalizedLabel: normalizeLabelName(label),
        confidence
      };
    }

    // Line items, grouped by Textract into LineItemGroups.
    for (const group of expenseDoc.LineItemGroups || []) {
      for (const lineItem of group.LineItems || []) {
        const item = {};

        for (const field of lineItem.LineItemExpenseFields || []) {
          const { type, value, label, confidence } = readField(field);
          if (!type || !value) continue;
          item[normalizeFieldName(type)] = {
            value,
            label,
            normalizedLabel: normalizeLabelName(label),
            confidence
          };
        }

        // Skip rows where nothing was captured.
        if (Object.keys(item).length > 0) {
          invoiceData.lineItems.push(item);
        }
      }
    }
  }

  return invoiceData;
}
|
||||
|
||||
// Public surface of the normalization module; bill-ocr.js imports both
// functions (see its require of "./bill-ocr-normalize").
module.exports = {
extractInvoiceData,
processScanData
}
|
||||
@@ -2,7 +2,9 @@ const { TextractClient, StartExpenseAnalysisCommand, GetExpenseAnalysisCommand,
|
||||
const { S3Client, PutObjectCommand } = require("@aws-sdk/client-s3");
|
||||
const { SQSClient, ReceiveMessageCommand, DeleteMessageCommand } = require("@aws-sdk/client-sqs");
|
||||
const { v4: uuidv4 } = require('uuid');
|
||||
const PDFDocument = require('pdf-lib').PDFDocument;
|
||||
const { getTextractJobKey, setTextractJob, getTextractJob, getFileType, getPdfPageCount, hasActiveJobs } = require("./bill-ocr-helpers");
|
||||
const { extractInvoiceData, processScanData } = require("./bill-ocr-normalize");
|
||||
const { generateBillFormData } = require("./bill-ocr-generator");
|
||||
|
||||
// Initialize AWS clients
|
||||
const awsConfig = {
|
||||
@@ -18,8 +20,7 @@ const s3Client = new S3Client(awsConfig);
|
||||
const sqsClient = new SQSClient(awsConfig);
|
||||
|
||||
let redisPubClient = null;
|
||||
const TEXTRACT_JOB_TTL = 3600;
|
||||
const MIN_CONFIDENCE_VALUE = 50
|
||||
|
||||
|
||||
/**
|
||||
* Initialize the bill-ocr module with Redis client
|
||||
@@ -29,43 +30,6 @@ function initializeBillOcr(pubClient) {
|
||||
redisPubClient = pubClient;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate Redis key for Textract job using textract job ID
|
||||
* @param {string} textractJobId
|
||||
* @returns {string}
|
||||
*/
|
||||
function getTextractJobKey(textractJobId) {
|
||||
return `textract:job:${textractJobId}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store Textract job data in Redis
|
||||
* @param {string} textractJobId
|
||||
* @param {Object} jobData
|
||||
*/
|
||||
async function setTextractJob(textractJobId, jobData) {
|
||||
if (!redisPubClient) {
|
||||
throw new Error('Redis client not initialized. Call initializeBillOcr first.');
|
||||
}
|
||||
const key = getTextractJobKey(textractJobId);
|
||||
await redisPubClient.set(key, JSON.stringify(jobData));
|
||||
await redisPubClient.expire(key, TEXTRACT_JOB_TTL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve Textract job data from Redis
|
||||
* @param {string} textractJobId
|
||||
* @returns {Promise<Object|null>}
|
||||
*/
|
||||
async function getTextractJob(textractJobId) {
|
||||
if (!redisPubClient) {
|
||||
throw new Error('Redis client not initialized. Call initializeBillOcr first.');
|
||||
}
|
||||
const key = getTextractJobKey(textractJobId);
|
||||
const data = await redisPubClient.get(key);
|
||||
return data ? JSON.parse(data) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if job exists by Textract job ID
|
||||
* @param {string} textractJobId
|
||||
@@ -89,42 +53,6 @@ async function jobExists(textractJobId) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if there are any jobs in IN_PROGRESS status
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async function hasActiveJobs() {
|
||||
if (!redisPubClient) {
|
||||
throw new Error('Redis client not initialized.');
|
||||
}
|
||||
|
||||
try {
|
||||
// Get all textract job keys
|
||||
const pattern = 'textract:job:*';
|
||||
const keys = await redisPubClient.keys(pattern);
|
||||
|
||||
if (!keys || keys.length === 0) {
|
||||
return false;
|
||||
}
|
||||
//TODO: Is there a better way to do this that supports clusters?
|
||||
// Check if any job has IN_PROGRESS status
|
||||
for (const key of keys) {
|
||||
const data = await redisPubClient.get(key);
|
||||
if (data) {
|
||||
const jobData = JSON.parse(data);
|
||||
if (jobData.status === 'IN_PROGRESS') {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
} catch (error) {
|
||||
console.error('Error checking for active jobs:', error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
async function handleBillOcr(request, response) {
|
||||
// Check if file was uploaded
|
||||
if (!request.file) {
|
||||
@@ -134,6 +62,7 @@ async function handleBillOcr(request, response) {
|
||||
|
||||
// The uploaded file is available in request.file
|
||||
const uploadedFile = request.file;
|
||||
const { jobid, bodyshopid, parts_orderid } = request.body;
|
||||
|
||||
try {
|
||||
const fileType = getFileType(uploadedFile);
|
||||
@@ -160,10 +89,11 @@ async function handleBillOcr(request, response) {
|
||||
console.log('PDF => 1 page, processing synchronously');
|
||||
const result = await processSinglePageDocument(uploadedFile.buffer);
|
||||
|
||||
//const billResult = await generateBillFormData({ result, });
|
||||
response.status(200).send({
|
||||
success: true,
|
||||
status: 'COMPLETED',
|
||||
data: result,
|
||||
data: { result, },
|
||||
message: 'Invoice processing completed'
|
||||
});
|
||||
} else {
|
||||
@@ -194,17 +124,8 @@ async function handleBillOcr(request, response) {
|
||||
}
|
||||
|
||||
async function handleBillOcrStatus(request, response) {
|
||||
console.log('handleBillOcrStatus called');
|
||||
console.log('request.params:', request.params);
|
||||
console.log('request.query:', request.query);
|
||||
|
||||
|
||||
|
||||
|
||||
const { jobId: textractJobId } = request.params;
|
||||
|
||||
|
||||
|
||||
if (!textractJobId) {
|
||||
console.log('No textractJobId found in params');
|
||||
response.status(400).send({ error: 'Job ID is required' });
|
||||
@@ -212,7 +133,7 @@ async function handleBillOcrStatus(request, response) {
|
||||
}
|
||||
|
||||
console.log('Looking for job:', textractJobId);
|
||||
const jobStatus = await getTextractJob(textractJobId);
|
||||
const jobStatus = await getTextractJob({ redisPubClient, textractJobId });
|
||||
console.log('Job status:', jobStatus);
|
||||
|
||||
if (!jobStatus) {
|
||||
@@ -237,72 +158,8 @@ async function handleBillOcrStatus(request, response) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect file type based on MIME type and file signature
|
||||
* @param {Object} file - Multer file object
|
||||
* @returns {string} 'pdf', 'image', or 'unknown'
|
||||
*/
|
||||
function getFileType(file) {
|
||||
// Check MIME type first
|
||||
const mimeType = file.mimetype?.toLowerCase();
|
||||
|
||||
if (mimeType === 'application/pdf') {
|
||||
return 'pdf';
|
||||
}
|
||||
|
||||
if (mimeType && mimeType.startsWith('image/')) {
|
||||
return 'image';
|
||||
}
|
||||
|
||||
// Fallback: Check file signature (magic bytes)
|
||||
const buffer = file.buffer;
|
||||
if (buffer && buffer.length > 4) {
|
||||
// PDF signature: %PDF
|
||||
if (buffer[0] === 0x25 && buffer[1] === 0x50 && buffer[2] === 0x44 && buffer[3] === 0x46) {
|
||||
return 'pdf';
|
||||
}
|
||||
|
||||
// JPEG signature: FF D8 FF
|
||||
if (buffer[0] === 0xFF && buffer[1] === 0xD8 && buffer[2] === 0xFF) {
|
||||
return 'image';
|
||||
}
|
||||
|
||||
// PNG signature: 89 50 4E 47
|
||||
if (buffer[0] === 0x89 && buffer[1] === 0x50 && buffer[2] === 0x4E && buffer[3] === 0x47) {
|
||||
return 'image';
|
||||
}
|
||||
|
||||
// HEIC/HEIF: Check for ftyp followed by heic/heix/hevc/hevx
|
||||
if (buffer.length > 12) {
|
||||
const ftypIndex = buffer.indexOf(Buffer.from('ftyp'));
|
||||
if (ftypIndex > 0 && ftypIndex < 12) {
|
||||
const brand = buffer.slice(ftypIndex + 4, ftypIndex + 8).toString('ascii');
|
||||
if (brand.startsWith('heic') || brand.startsWith('heix') ||
|
||||
brand.startsWith('hevc') || brand.startsWith('hevx') ||
|
||||
brand.startsWith('mif1')) {
|
||||
return 'image';
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 'unknown';
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of pages in a PDF buffer
|
||||
* @param {Buffer} pdfBuffer
|
||||
* @returns {Promise<number>}
|
||||
*/
|
||||
async function getPdfPageCount(pdfBuffer) {
|
||||
try {
|
||||
const pdfDoc = await PDFDocument.load(pdfBuffer);
|
||||
return pdfDoc.getPageCount();
|
||||
} catch (error) {
|
||||
console.error('Error reading PDF page count:', error);
|
||||
throw new Error('Failed to read PDF: ' + error.message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a single-page document synchronously using AnalyzeExpenseCommand
|
||||
@@ -373,12 +230,18 @@ async function startTextractJob(pdfBuffer) {
|
||||
const textractJobId = startResult.JobId;
|
||||
|
||||
// Store job info in Redis using textractJobId as the key
|
||||
await setTextractJob(textractJobId, {
|
||||
status: 'IN_PROGRESS',
|
||||
s3Key: s3Key,
|
||||
uploadId: uploadId,
|
||||
startedAt: new Date().toISOString()
|
||||
});
|
||||
await setTextractJob(
|
||||
{
|
||||
redisPubClient,
|
||||
textractJobId,
|
||||
jobData: {
|
||||
status: 'IN_PROGRESS',
|
||||
s3Key: s3Key,
|
||||
uploadId: uploadId,
|
||||
startedAt: new Date().toISOString()
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
return {
|
||||
jobId: textractJobId
|
||||
@@ -395,7 +258,7 @@ async function processSQSMessages() {
|
||||
}
|
||||
|
||||
// Only poll if there are active multi-page jobs in progress
|
||||
const hasActive = await hasActiveJobs();
|
||||
const hasActive = await hasActiveJobs({ redisPubClient });
|
||||
if (!hasActive) {
|
||||
console.log('No active jobs in progress, skipping SQS poll');
|
||||
return;
|
||||
@@ -464,28 +327,40 @@ async function handleTextractNotification(message) {
|
||||
return;
|
||||
}
|
||||
|
||||
const jobInfo = await getTextractJob(textractJobId);
|
||||
const jobInfo = await getTextractJob({ redisPubClient, textractJobId });
|
||||
|
||||
if (status === 'SUCCEEDED') {
|
||||
// Retrieve the results
|
||||
const { processedData, originalResponse } = await retrieveTextractResults(textractJobId);
|
||||
|
||||
await setTextractJob(textractJobId, {
|
||||
...jobInfo,
|
||||
status: 'COMPLETED',
|
||||
data: {
|
||||
...processedData,
|
||||
originalTextractResponse: originalResponse
|
||||
},
|
||||
completedAt: new Date().toISOString()
|
||||
});
|
||||
await setTextractJob(
|
||||
{
|
||||
redisPubClient,
|
||||
textractJobId,
|
||||
jobData: {
|
||||
...jobInfo,
|
||||
status: 'COMPLETED',
|
||||
data: {
|
||||
...processedData,
|
||||
originalTextractResponse: originalResponse
|
||||
},
|
||||
completedAt: new Date().toISOString()
|
||||
}
|
||||
}
|
||||
);
|
||||
} else if (status === 'FAILED') {
|
||||
await setTextractJob(textractJobId, {
|
||||
...jobInfo,
|
||||
status: 'FAILED',
|
||||
error: snsMessage.StatusMessage || 'Textract job failed',
|
||||
completedAt: new Date().toISOString()
|
||||
});
|
||||
await setTextractJob(
|
||||
{
|
||||
redisPubClient,
|
||||
textractJobId,
|
||||
jobData: {
|
||||
...jobInfo,
|
||||
status: 'FAILED',
|
||||
error: snsMessage.StatusMessage || 'Textract job failed',
|
||||
completedAt: new Date().toISOString()
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -531,179 +406,6 @@ function startSQSPolling() {
|
||||
return pollInterval;
|
||||
}
|
||||
|
||||
function extractInvoiceData(textractResponse) {
|
||||
const invoiceData = {
|
||||
summary: {},
|
||||
lineItems: []
|
||||
};
|
||||
|
||||
if (!textractResponse.ExpenseDocuments || textractResponse.ExpenseDocuments.length === 0) {
|
||||
return invoiceData;
|
||||
}
|
||||
|
||||
// Process each page of the invoice
|
||||
textractResponse.ExpenseDocuments.forEach(expenseDoc => {
|
||||
// Extract summary fields (vendor, invoice number, date, total, etc.)
|
||||
if (expenseDoc.SummaryFields) {
|
||||
expenseDoc.SummaryFields.forEach(field => {
|
||||
const fieldType = field.Type?.Text || '';
|
||||
const fieldValue = field.ValueDetection?.Text || '';
|
||||
const fieldLabel = field.LabelDetection?.Text || '';
|
||||
const confidence = field.ValueDetection?.Confidence || 0;
|
||||
|
||||
// Map common invoice fields
|
||||
if (fieldType && fieldValue) {
|
||||
invoiceData.summary[fieldType] = {
|
||||
value: fieldValue,
|
||||
label: fieldLabel,
|
||||
normalizedLabel: normalizeLabelName(fieldLabel),
|
||||
confidence: confidence
|
||||
};
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Extract line items
|
||||
if (expenseDoc.LineItemGroups) {
|
||||
expenseDoc.LineItemGroups.forEach(lineItemGroup => {
|
||||
if (lineItemGroup.LineItems) {
|
||||
lineItemGroup.LineItems.forEach(lineItem => {
|
||||
const item = {};
|
||||
|
||||
if (lineItem.LineItemExpenseFields) {
|
||||
lineItem.LineItemExpenseFields.forEach(field => {
|
||||
const fieldType = field.Type?.Text || '';
|
||||
const fieldValue = field.ValueDetection?.Text || '';
|
||||
const fieldLabel = field.LabelDetection?.Text || '';
|
||||
const confidence = field.ValueDetection?.Confidence || 0;
|
||||
|
||||
if (fieldType && fieldValue) {
|
||||
// Normalize field names
|
||||
const normalizedField = normalizeFieldName(fieldType);
|
||||
item[normalizedField] = {
|
||||
value: fieldValue,
|
||||
label: fieldLabel,
|
||||
normalizedLabel: normalizeLabelName(fieldLabel),
|
||||
confidence: confidence
|
||||
};
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (Object.keys(item).length > 0) {
|
||||
invoiceData.lineItems.push(item);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return invoiceData;
|
||||
}
|
||||
|
||||
function normalizeFieldName(fieldType) {
|
||||
//Placeholder normalization for now.
|
||||
return fieldType;
|
||||
}
|
||||
|
||||
function normalizeLabelName(labelText) {
|
||||
if (!labelText) return '';
|
||||
|
||||
// Convert to lowercase and trim whitespace
|
||||
let normalized = labelText.toLowerCase().trim();
|
||||
|
||||
// Remove special characters and replace spaces with underscores
|
||||
normalized = normalized.replace(/[^a-z0-9\s]/g, '').replace(/\s+/g, '_');
|
||||
const standardizedFieldsnames = {
|
||||
actual_cost: "actual_cost",
|
||||
actual_price: "actual_price",
|
||||
line_desc: "line_desc",
|
||||
quantity: "quantity",
|
||||
part_no: "part_no"
|
||||
}
|
||||
|
||||
// Common label normalizations
|
||||
const labelMap = {
|
||||
'qty': standardizedFieldsnames.quantity,
|
||||
'qnty': standardizedFieldsnames.quantity,
|
||||
'sale_qty': standardizedFieldsnames.quantity,
|
||||
'quant': standardizedFieldsnames.quantity,
|
||||
'desc': standardizedFieldsnames.line_desc,
|
||||
'description': standardizedFieldsnames.line_desc,
|
||||
'item': standardizedFieldsnames.line_desc,
|
||||
'part': standardizedFieldsnames.part_no,
|
||||
'part_no': standardizedFieldsnames.part_no,
|
||||
'part_num': standardizedFieldsnames.part_no,
|
||||
'part_number': standardizedFieldsnames.part_no,
|
||||
'price': standardizedFieldsnames.actual_price,
|
||||
'unit_price': standardizedFieldsnames.actual_price,
|
||||
'amount': standardizedFieldsnames.actual_price,
|
||||
'list_price': standardizedFieldsnames.actual_price,
|
||||
'list': standardizedFieldsnames.actual_price,
|
||||
'retail_price': standardizedFieldsnames.actual_price,
|
||||
'net': standardizedFieldsnames.actual_cost,
|
||||
'selling_price': standardizedFieldsnames.actual_cost,
|
||||
|
||||
};
|
||||
|
||||
return labelMap[normalized] || normalized; // TODO: Should we monitor unmapped labels?
|
||||
}
|
||||
|
||||
function processScanData(invoiceData) {
|
||||
// Process and clean the extracted data
|
||||
const processed = {
|
||||
summary: {},
|
||||
lineItems: []
|
||||
};
|
||||
|
||||
// Clean summary fields
|
||||
for (const [key, value] of Object.entries(invoiceData.summary)) {
|
||||
if (value.confidence > MIN_CONFIDENCE_VALUE) { // Only include fields with > 50% confidence
|
||||
processed.summary[key] = {
|
||||
value: value.value,
|
||||
label: value.label,
|
||||
normalizedLabel: value.normalizedLabel,
|
||||
confidence: value.confidence
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Process line items
|
||||
processed.lineItems = invoiceData.lineItems
|
||||
.map(item => {
|
||||
const processedItem = {};
|
||||
|
||||
for (const [key, value] of Object.entries(item)) {
|
||||
if (value.confidence > MIN_CONFIDENCE_VALUE) { // Only include fields with > 50% confidence
|
||||
let cleanValue = value.value;
|
||||
|
||||
// Parse numbers for quantity and price fields
|
||||
if (key === 'quantity') {
|
||||
cleanValue = parseFloat(cleanValue) || 0;
|
||||
} else if (key === 'retail_price' || key === 'actual_price') {
|
||||
// Remove currency symbols and parse
|
||||
cleanValue = parseFloat(cleanValue.replace(/[^0-9.-]/g, '')) || 0;
|
||||
}
|
||||
|
||||
processedItem[key] = {
|
||||
value: cleanValue,
|
||||
label: value.label,
|
||||
normalizedLabel: value.normalizedLabel,
|
||||
confidence: value.confidence
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return processedItem;
|
||||
})
|
||||
// .filter(item => {
|
||||
// // Filter out items with no description or with quantity <= 0
|
||||
// return item.description && (!item.quantity || item.quantity > 0);
|
||||
// });
|
||||
|
||||
return processed;
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
initializeBillOcr,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user