IO-3515 WIP - bulk calls functioning. Further refinement required.

This commit is contained in:
Patrick Fic
2026-01-26 16:09:58 -08:00
parent c3718fff87
commit 2a6d0446f0
2 changed files with 117 additions and 36 deletions

View File

@@ -441,8 +441,11 @@ const main = async () => {
await server.listen(port); await server.listen(port);
logger.log(`Server started on port ${port}`, "INFO", "api"); logger.log(`Server started on port ${port}`, "INFO", "api");
// Initialize bill-ocr with Redis client
const { initializeBillOcr, startSQSPolling } = require("./server/ai/bill-ocr/bill-ocr");
initializeBillOcr(pubClient);
// Start SQS polling for Textract notifications // Start SQS polling for Textract notifications
const { startSQSPolling } = require("./server/ai/bill-ocr/bill-ocr");
startSQSPolling(); startSQSPolling();
logger.log(`Started SQS polling for Textract notifications`, "INFO", "api"); logger.log(`Started SQS polling for Textract notifications`, "INFO", "api");
} catch (error) { } catch (error) {

View File

@@ -16,8 +16,76 @@ const textractClient = new TextractClient(awsConfig);
const s3Client = new S3Client(awsConfig); const s3Client = new S3Client(awsConfig);
const sqsClient = new SQSClient(awsConfig); const sqsClient = new SQSClient(awsConfig);
// In-memory job storage (consider using Redis or a database for production) let redisPubClient = null;
const jobStore = new Map(); const TEXTRACT_JOB_TTL = 3600;
/**
 * Initialize the bill-ocr module by storing the Redis client in the
 * module-level `redisPubClient` variable.
 *
 * The Redis-backed job helpers in this module (setTextractJob, getTextractJob,
 * jobExists) throw "Redis client not initialized" while that variable is falsy,
 * so this must be called before any of them are used.
 *
 * @param {Object} pubClient - Redis cluster client. Stored as-is; no validation is performed here.
 */
function initializeBillOcr(pubClient) {
redisPubClient = pubClient;
}
/**
 * Build the Redis key under which a Textract job's data is stored.
 * @param {string} textractJobId - Textract job ID; appended to the key prefix after string conversion.
 * @returns {string} Key of the form `textract:job:<textractJobId>`.
 */
function getTextractJobKey(textractJobId) {
  const keyPrefix = 'textract:job:';
  return keyPrefix.concat(textractJobId);
}
/**
 * Persist Textract job data in Redis.
 *
 * The job data is serialized with JSON.stringify and written under the key
 * produced by getTextractJobKey; the key is then given an expiry of
 * TEXTRACT_JOB_TTL via a separate `expire` call.
 *
 * NOTE(review): `set` and `expire` are two separate commands, so a failure
 * between them would leave the key without an expiry. A single SET with an
 * expiry option would avoid that — confirm the client library's signature
 * before changing.
 *
 * @param {string} textractJobId - Textract job ID used to derive the Redis key.
 * @param {Object} jobData - Job record to store; must be JSON-serializable.
 * @returns {Promise<void>}
 * @throws {Error} If the Redis client has not been set via initializeBillOcr.
 */
async function setTextractJob(textractJobId, jobData) {
  if (redisPubClient) {
    const redisKey = getTextractJobKey(textractJobId);
    await redisPubClient.set(redisKey, JSON.stringify(jobData));
    await redisPubClient.expire(redisKey, TEXTRACT_JOB_TTL);
    return;
  }
  throw new Error('Redis client not initialized. Call initializeBillOcr first.');
}
/**
 * Load Textract job data from Redis.
 *
 * Reads the value stored under the key produced by getTextractJobKey and
 * parses it as JSON. A falsy value from the client (e.g. a missing key)
 * yields null.
 *
 * @param {string} textractJobId - Textract job ID used to derive the Redis key.
 * @returns {Promise<Object|null>} Parsed job data, or null when nothing is stored.
 * @throws {Error} If the Redis client has not been set via initializeBillOcr.
 */
async function getTextractJob(textractJobId) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }

  const rawValue = await redisPubClient.get(getTextractJobKey(textractJobId));
  if (!rawValue) {
    return null;
  }
  return JSON.parse(rawValue);
}
/**
 * Report whether a job record is present in Redis for a Textract job ID.
 *
 * Calls `exists` on the key produced by getTextractJobKey and treats any
 * truthy reply as "found". Logs the lookup and its outcome to the console.
 *
 * @param {string} textractJobId - Textract job ID used to derive the Redis key.
 * @returns {Promise<boolean>} true when the key exists, false otherwise.
 * @throws {Error} If the Redis client has not been set via initializeBillOcr.
 */
async function jobExists(textractJobId) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }

  console.log('Checking if job exists for Textract job ID:', textractJobId);

  const existsReply = await redisPubClient.exists(getTextractJobKey(textractJobId));
  const found = Boolean(existsReply);

  if (found) {
    console.log(`Job found: ${textractJobId}`);
  } else {
    console.log('No matching job found in Redis');
  }
  return found;
}
async function handleBillOcr(request, response) { async function handleBillOcr(request, response) {
// Check if file was uploaded // Check if file was uploaded
@@ -37,7 +105,7 @@ async function handleBillOcr(request, response) {
success: true, success: true,
jobId: jobInfo.jobId, jobId: jobInfo.jobId,
message: 'Invoice processing started', message: 'Invoice processing started',
statusUrl: `/api/bill-ocr/status/${jobInfo.jobId}` statusUrl: `/ai/bill-ocr/status/${jobInfo.jobId}`
}); });
} catch (error) { } catch (error) {
console.error('Error starting invoice processing:', error); console.error('Error starting invoice processing:', error);
@@ -53,16 +121,21 @@ async function handleBillOcrStatus(request, response) {
console.log('request.params:', request.params); console.log('request.params:', request.params);
console.log('request.query:', request.query); console.log('request.query:', request.query);
const { jobId } = request.params;
if (!jobId) {
console.log('No jobId found in params');
const { jobId: textractJobId } = request.params;
if (!textractJobId) {
console.log('No textractJobId found in params');
response.status(400).send({ error: 'Job ID is required' }); response.status(400).send({ error: 'Job ID is required' });
return; return;
} }
console.log('Looking for job:', jobId); console.log('Looking for job:', textractJobId);
const jobStatus = jobStore.get(jobId); const jobStatus = await getTextractJob(textractJobId);
console.log('Job status:', jobStatus); console.log('Job status:', jobStatus);
if (!jobStatus) { if (!jobStatus) {
@@ -103,8 +176,8 @@ async function startTextractJob(pdfBuffer) {
throw new Error('AWS_TEXTRACT_SNS_ROLE_ARN environment variable is required'); throw new Error('AWS_TEXTRACT_SNS_ROLE_ARN environment variable is required');
} }
const jobId = uuidv4(); const uploadId = uuidv4();
const s3Key = `textract-temp/${jobId}.pdf`; const s3Key = `textract-temp/${uploadId}.pdf`;
// Upload to S3 // Upload to S3
const uploadCommand = new PutObjectCommand({ const uploadCommand = new PutObjectCommand({
@@ -123,30 +196,26 @@ async function startTextractJob(pdfBuffer) {
Name: s3Key Name: s3Key
} }
}, },
OutputConfig: {
S3Bucket: s3Bucket,
S3Prefix: `textract-output/${jobId}/`
},
NotificationChannel: { NotificationChannel: {
SNSTopicArn: snsTopicArn, SNSTopicArn: snsTopicArn,
RoleArn: snsRoleArn RoleArn: snsRoleArn
}, },
ClientRequestToken: jobId ClientRequestToken: uploadId
}); });
const startResult = await textractClient.send(startCommand); const startResult = await textractClient.send(startCommand);
const textractJobId = startResult.JobId;
// Store job info // Store job info in Redis using textractJobId as the key
jobStore.set(jobId, { await setTextractJob(textractJobId, {
status: 'IN_PROGRESS', status: 'IN_PROGRESS',
textractJobId: startResult.JobId,
s3Key: s3Key, s3Key: s3Key,
uploadId: uploadId,
startedAt: new Date().toISOString() startedAt: new Date().toISOString()
}); });
return { return {
jobId: jobId, jobId: textractJobId
textractJobId: startResult.JobId
}; };
} }
@@ -196,40 +265,48 @@ async function processSQSMessages() {
async function handleTextractNotification(message) { async function handleTextractNotification(message) {
const body = JSON.parse(message.Body); const body = JSON.parse(message.Body);
const snsMessage = JSON.parse(body.Message); let snsMessage
try {
snsMessage = JSON.parse(body.Message);
} catch (error) {
//Delete the message so it doesn't clog the queue
const deleteCommand = new DeleteMessageCommand({
QueueUrl: process.env.AWS_TEXTRACT_SQS_QUEUE_URL,
ReceiptHandle: message.ReceiptHandle
});
await sqsClient.send(deleteCommand);
console.error('Error parsing SNS message:', error);
console.log('Message Deleted:', body);
return;
}
const textractJobId = snsMessage.JobId; const textractJobId = snsMessage.JobId;
const status = snsMessage.Status; const status = snsMessage.Status;
// Find our job by Textract job ID // Check if job exists in Redis
let ourJobId = null; const exists = await jobExists(textractJobId);
for (const [key, value] of jobStore.entries()) {
if (value.textractJobId === textractJobId) {
ourJobId = key;
break;
}
}
if (!ourJobId) { if (!exists) {
console.warn(`Job not found for Textract job ID: ${textractJobId}`); console.warn(`Job not found for Textract job ID: ${textractJobId}`);
return; return;
} }
const jobInfo = jobStore.get(ourJobId); const jobInfo = await getTextractJob(textractJobId);
if (status === 'SUCCEEDED') { if (status === 'SUCCEEDED') {
// Retrieve the results // Retrieve the results
const invoiceData = await retrieveTextractResults(textractJobId); const invoiceData = await retrieveTextractResults(textractJobId);
const processedData = processScanData(invoiceData); const processedData = processScanData(invoiceData);
jobStore.set(ourJobId, { await setTextractJob(textractJobId, {
...jobInfo, ...jobInfo,
status: 'COMPLETED', status: 'COMPLETED',
data: processedData, data: processedData,
completedAt: new Date().toISOString() completedAt: new Date().toISOString()
}); });
} else if (status === 'FAILED') { } else if (status === 'FAILED') {
jobStore.set(ourJobId, { await setTextractJob(textractJobId, {
...jobInfo, ...jobInfo,
status: 'FAILED', status: 'FAILED',
error: snsMessage.StatusMessage || 'Textract job failed', error: snsMessage.StatusMessage || 'Textract job failed',
@@ -268,7 +345,7 @@ function startSQSPolling() {
processSQSMessages().catch(error => { processSQSMessages().catch(error => {
console.error('SQS polling error:', error); console.error('SQS polling error:', error);
}); });
}, 5000); // Poll every 5 seconds }, 10000); // Poll every 10 seconds
return pollInterval; return pollInterval;
} }
@@ -339,7 +416,7 @@ function extractInvoiceData(textractResponse) {
} }
function normalizeFieldName(fieldType) { function normalizeFieldName(fieldType) {
// Convert Textract field types to more readable names //Placeholder normalization for now.
const fieldMap = { const fieldMap = {
'ITEM': 'description', 'ITEM': 'description',
'QUANTITY': 'quantity', 'QUANTITY': 'quantity',
@@ -398,6 +475,7 @@ function processScanData(invoiceData) {
} }
module.exports = { module.exports = {
initializeBillOcr,
handleBillOcr, handleBillOcr,
handleBillOcrStatus, handleBillOcrStatus,
startSQSPolling startSQSPolling