IO-3515 WIP - bulk calls functioning. Further refinement required.

This commit is contained in:
Patrick Fic
2026-01-26 16:09:58 -08:00
parent c3718fff87
commit 2a6d0446f0
2 changed files with 117 additions and 36 deletions

View File

@@ -16,8 +16,76 @@ const textractClient = new TextractClient(awsConfig);
// AWS SDK clients for S3 and SQS, both constructed from the shared awsConfig.
const s3Client = new S3Client(awsConfig);
const sqsClient = new SQSClient(awsConfig);
// In-memory job storage (consider using Redis or a database for production)
const jobStore = new Map();
// Redis client used by the Textract job helpers; stays null until
// initializeBillOcr() assigns it, and the helpers throw while it is unset.
let redisPubClient = null;
// TTL handed to redisPubClient.expire() for every stored job key.
// NOTE(review): presumably seconds (i.e. one hour) — confirm against the Redis client in use.
const TEXTRACT_JOB_TTL = 3600;
/**
 * Wire the bill-ocr module up to Redis.
 *
 * Stores the supplied client in the module-level `redisPubClient` binding.
 * The Textract job helpers (`setTextractJob`, `getTextractJob`, `jobExists`)
 * throw until this has been called with a usable client.
 *
 * @param {Object} pubClient - Redis cluster client
 * @returns {void}
 */
function initializeBillOcr(pubClient) {
  // No validation here: whatever is passed in (including null) becomes the
  // module-wide client handle.
  redisPubClient = pubClient;
}
/**
 * Build the Redis key under which a Textract job's data is stored.
 *
 * @param {string} textractJobId - Job ID to embed in the key
 * @returns {string} Key of the form `textract:job:<textractJobId>`
 */
function getTextractJobKey(textractJobId) {
  // Single namespace prefix shared by every Textract job entry.
  const keyPrefix = 'textract:job:';
  return `${keyPrefix}${textractJobId}`;
}
/**
 * Persist a Textract job record in Redis.
 *
 * The record is JSON-serialised, written under the key produced by
 * `getTextractJobKey`, and then given an expiry of `TEXTRACT_JOB_TTL`.
 *
 * @param {string} textractJobId - Job ID used to derive the Redis key
 * @param {Object} jobData - Job record to serialise and store
 * @returns {Promise<void>} Resolves once both the write and the expiry call finish
 * @throws {Error} If `initializeBillOcr` has not supplied a Redis client yet
 */
async function setTextractJob(textractJobId, jobData) {
  if (redisPubClient) {
    const redisKey = getTextractJobKey(textractJobId);
    const serialized = JSON.stringify(jobData);

    // NOTE(review): the write and the expiry are two separate commands, so a
    // failure between them would leave the key without a TTL. Redis SET can
    // carry the expiry itself; the exact call shape depends on the client
    // library in use — confirm before changing.
    await redisPubClient.set(redisKey, serialized);
    await redisPubClient.expire(redisKey, TEXTRACT_JOB_TTL);
    return;
  }

  throw new Error('Redis client not initialized. Call initializeBillOcr first.');
}
/**
 * Load a Textract job record from Redis.
 *
 * Reads the value stored under the key produced by `getTextractJobKey` and
 * parses it as JSON.
 *
 * @param {string} textractJobId - Job ID used to derive the Redis key
 * @returns {Promise<Object|null>} The parsed record, or null when Redis
 *   returns a falsy value for the key
 * @throws {Error} If `initializeBillOcr` has not supplied a Redis client yet
 */
async function getTextractJob(textractJobId) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }

  const rawRecord = await redisPubClient.get(getTextractJobKey(textractJobId));

  // Any falsy reply (nothing stored, or an empty string) is reported as "no job".
  if (!rawRecord) {
    return null;
  }
  return JSON.parse(rawRecord);
}
/**
 * Report whether Redis holds an entry for the given Textract job ID.
 *
 * Logs the lookup and its outcome to the console.
 *
 * @param {string} textractJobId - Job ID used to derive the Redis key
 * @returns {Promise<boolean>} true when the Redis `exists` reply is truthy, false otherwise
 * @throws {Error} If `initializeBillOcr` has not supplied a Redis client yet
 */
async function jobExists(textractJobId) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }

  console.log('Checking if job exists for Textract job ID:', textractJobId);

  const redisKey = getTextractJobKey(textractJobId);
  const existsReply = await redisPubClient.exists(redisKey);

  // Collapse the client's reply to a strict boolean for callers.
  const found = Boolean(existsReply);
  console.log(found ? `Job found: ${textractJobId}` : 'No matching job found in Redis');
  return found;
}
async function handleBillOcr(request, response) {
// Check if file was uploaded
@@ -37,7 +105,7 @@ async function handleBillOcr(request, response) {
success: true,
jobId: jobInfo.jobId,
message: 'Invoice processing started',
statusUrl: `/api/bill-ocr/status/${jobInfo.jobId}`
statusUrl: `/ai/bill-ocr/status/${jobInfo.jobId}`
});
} catch (error) {
console.error('Error starting invoice processing:', error);
@@ -53,16 +121,21 @@ async function handleBillOcrStatus(request, response) {
console.log('request.params:', request.params);
console.log('request.query:', request.query);
const { jobId } = request.params;
if (!jobId) {
console.log('No jobId found in params');
const { jobId: textractJobId } = request.params;
if (!textractJobId) {
console.log('No textractJobId found in params');
response.status(400).send({ error: 'Job ID is required' });
return;
}
console.log('Looking for job:', jobId);
const jobStatus = jobStore.get(jobId);
console.log('Looking for job:', textractJobId);
const jobStatus = await getTextractJob(textractJobId);
console.log('Job status:', jobStatus);
if (!jobStatus) {
@@ -103,8 +176,8 @@ async function startTextractJob(pdfBuffer) {
throw new Error('AWS_TEXTRACT_SNS_ROLE_ARN environment variable is required');
}
const jobId = uuidv4();
const s3Key = `textract-temp/${jobId}.pdf`;
const uploadId = uuidv4();
const s3Key = `textract-temp/${uploadId}.pdf`;
// Upload to S3
const uploadCommand = new PutObjectCommand({
@@ -123,30 +196,26 @@ async function startTextractJob(pdfBuffer) {
Name: s3Key
}
},
OutputConfig: {
S3Bucket: s3Bucket,
S3Prefix: `textract-output/${jobId}/`
},
NotificationChannel: {
SNSTopicArn: snsTopicArn,
RoleArn: snsRoleArn
},
ClientRequestToken: jobId
ClientRequestToken: uploadId
});
const startResult = await textractClient.send(startCommand);
const textractJobId = startResult.JobId;
// Store job info
jobStore.set(jobId, {
// Store job info in Redis using textractJobId as the key
await setTextractJob(textractJobId, {
status: 'IN_PROGRESS',
textractJobId: startResult.JobId,
s3Key: s3Key,
uploadId: uploadId,
startedAt: new Date().toISOString()
});
return {
jobId: jobId,
textractJobId: startResult.JobId
jobId: textractJobId
};
}
@@ -196,40 +265,48 @@ async function processSQSMessages() {
async function handleTextractNotification(message) {
const body = JSON.parse(message.Body);
const snsMessage = JSON.parse(body.Message);
let snsMessage
try {
snsMessage = JSON.parse(body.Message);
} catch (error) {
//Delete the message so it doesn't clog the queue
const deleteCommand = new DeleteMessageCommand({
QueueUrl: process.env.AWS_TEXTRACT_SQS_QUEUE_URL,
ReceiptHandle: message.ReceiptHandle
});
await sqsClient.send(deleteCommand);
console.error('Error parsing SNS message:', error);
console.log('Message Deleted:', body);
return;
}
const textractJobId = snsMessage.JobId;
const status = snsMessage.Status;
// Find our job by Textract job ID
let ourJobId = null;
for (const [key, value] of jobStore.entries()) {
if (value.textractJobId === textractJobId) {
ourJobId = key;
break;
}
}
// Check if job exists in Redis
const exists = await jobExists(textractJobId);
if (!ourJobId) {
if (!exists) {
console.warn(`Job not found for Textract job ID: ${textractJobId}`);
return;
}
const jobInfo = jobStore.get(ourJobId);
const jobInfo = await getTextractJob(textractJobId);
if (status === 'SUCCEEDED') {
// Retrieve the results
const invoiceData = await retrieveTextractResults(textractJobId);
const processedData = processScanData(invoiceData);
jobStore.set(ourJobId, {
await setTextractJob(textractJobId, {
...jobInfo,
status: 'COMPLETED',
data: processedData,
completedAt: new Date().toISOString()
});
} else if (status === 'FAILED') {
jobStore.set(ourJobId, {
await setTextractJob(textractJobId, {
...jobInfo,
status: 'FAILED',
error: snsMessage.StatusMessage || 'Textract job failed',
@@ -268,7 +345,7 @@ function startSQSPolling() {
processSQSMessages().catch(error => {
console.error('SQS polling error:', error);
});
}, 5000); // Poll every 5 seconds
}, 10000); // Poll every 10 seconds
return pollInterval;
}
@@ -339,7 +416,7 @@ function extractInvoiceData(textractResponse) {
}
function normalizeFieldName(fieldType) {
// Convert Textract field types to more readable names
//Placeholder normalization for now.
const fieldMap = {
'ITEM': 'description',
'QUANTITY': 'quantity',
@@ -398,6 +475,7 @@ function processScanData(invoiceData) {
}
module.exports = {
initializeBillOcr,
handleBillOcr,
handleBillOcrStatus,
startSQSPolling