bodyshop/server/ai/bill-ocr/bill-ocr.js

const { TextractClient, StartExpenseAnalysisCommand, GetExpenseAnalysisCommand, AnalyzeExpenseCommand } = require("@aws-sdk/client-textract");
const { S3Client, PutObjectCommand } = require("@aws-sdk/client-s3");
const { SQSClient, ReceiveMessageCommand, DeleteMessageCommand } = require("@aws-sdk/client-sqs");
const { v4: uuidv4 } = require('uuid');
const PDFDocument = require('pdf-lib').PDFDocument;

// Initialize AWS clients
// All three clients share one configuration: the region comes from AWS_AI_REGION
// (falling back to "ca-central-1") and the credentials from the AWS_AI_* variables.
const awsConfig = {
    region: process.env.AWS_AI_REGION || "ca-central-1",
    credentials: {
        accessKeyId: process.env.AWS_AI_ACCESS_KEY_ID,
        secretAccessKey: process.env.AWS_AI_SECRET_ACCESS_KEY,
    }
};

const textractClient = new TextractClient(awsConfig);
const s3Client = new S3Client(awsConfig);
const sqsClient = new SQSClient(awsConfig);

// Redis client used to persist Textract job state; stays null until initializeBillOcr() is called.
let redisPubClient = null;
// Value handed to Redis EXPIRE for every stored job record (seconds, i.e. one hour).
const TEXTRACT_JOB_TTL = 3600;
// Extracted fields are kept only when their confidence is strictly greater than this value.
const MIN_CONFIDENCE_VALUE = 50

/**
 * Initialize the bill-ocr module with the Redis client it uses to store and
 * look up Textract job records. The job helpers in this file throw until this
 * has been called with a truthy client.
 * @param {Object} pubClient - Redis cluster client
 * @returns {void}
 */
function initializeBillOcr(pubClient) {
    // Kept in module scope so every job helper shares the same client.
    redisPubClient = pubClient;
}

/**
 * Build the Redis key under which a Textract job record is stored.
 * @param {string} textractJobId - Job ID the record belongs to
 * @returns {string} Key of the form `textract:job:<textractJobId>`
 */
function getTextractJobKey(textractJobId) {
    const keyPrefix = 'textract:job:';
    return `${keyPrefix}${textractJobId}`;
}

/**
 * Write a Textract job record to Redis as JSON and (re)apply its expiry.
 * @param {string} textractJobId - Job ID the record is keyed by
 * @param {Object} jobData - JSON-serializable job state
 * @returns {Promise<void>}
 * @throws {Error} When no Redis client has been supplied via initializeBillOcr
 */
async function setTextractJob(textractJobId, jobData) {
    if (!redisPubClient) {
        throw new Error('Redis client not initialized. Call initializeBillOcr first.');
    }

    const redisKey = getTextractJobKey(textractJobId);
    const serializedJob = JSON.stringify(jobData);

    // The value is written first, then the TTL is set in a separate command.
    await redisPubClient.set(redisKey, serializedJob);
    await redisPubClient.expire(redisKey, TEXTRACT_JOB_TTL);
}

/**
 * Load a Textract job record from Redis.
 * @param {string} textractJobId - Job ID the record is keyed by
 * @returns {Promise<Object|null>} Parsed record, or null when nothing is stored
 * @throws {Error} When no Redis client has been supplied via initializeBillOcr
 */
async function getTextractJob(textractJobId) {
    if (!redisPubClient) {
        throw new Error('Redis client not initialized. Call initializeBillOcr first.');
    }

    const serializedJob = await redisPubClient.get(getTextractJobKey(textractJobId));
    if (!serializedJob) {
        return null;
    }
    return JSON.parse(serializedJob);
}

/**
 * Report whether Redis holds a record for the given Textract job ID.
 * Logs the lookup and its outcome.
 * @param {string} textractJobId - Job ID to look up
 * @returns {Promise<boolean>} true when the key exists, false otherwise
 * @throws {Error} When no Redis client has been supplied via initializeBillOcr
 */
async function jobExists(textractJobId) {
    if (!redisPubClient) {
        throw new Error('Redis client not initialized. Call initializeBillOcr first.');
    }

    console.log('Checking if job exists for Textract job ID:', textractJobId);
    const redisKey = getTextractJobKey(textractJobId);
    const found = Boolean(await redisPubClient.exists(redisKey));

    console.log(found ? `Job found: ${textractJobId}` : 'No matching job found in Redis');
    return found;
}

/**
 * Check if there are any jobs in IN_PROGRESS status.
 *
 * Lists every `textract:job:*` key, reads each record, and returns true as soon
 * as one has status 'IN_PROGRESS'. A record that cannot be parsed is logged and
 * skipped so that a single bad entry cannot hide other active jobs.
 *
 * @returns {Promise<boolean>} true when at least one job is in progress; false
 *   when none are, or when the Redis lookup itself fails (the error is logged)
 * @throws {Error} When no Redis client has been supplied via initializeBillOcr
 */
async function hasActiveJobs() {
    if (!redisPubClient) {
        throw new Error('Redis client not initialized.');
    }

    try {
        // Get all textract job keys
        const pattern = 'textract:job:*';
        const keys = await redisPubClient.keys(pattern);

        if (!keys || keys.length === 0) {
            return false;
        }
        //TODO: Is there a better way to do this that supports clusters?
        // Check if any job has IN_PROGRESS status
        for (const key of keys) {
            const data = await redisPubClient.get(key);
            if (!data) {
                // The key may have expired between the listing and this read.
                continue;
            }

            let jobData;
            try {
                jobData = JSON.parse(data);
            } catch (parseError) {
                console.warn(`Skipping unparseable Textract job record at ${key}:`, parseError);
                continue;
            }

            // Optional chaining tolerates a record that parsed to null.
            if (jobData?.status === 'IN_PROGRESS') {
                return true;
            }
        }

        return false;
    } catch (error) {
        console.error('Error checking for active jobs:', error);
        return false;
    }
}

/**
 * HTTP handler for an uploaded bill/invoice.
 *
 * Images and single-page PDFs are analyzed in-request and answered with 200 and
 * the extracted data. PDFs with any other page count start an asynchronous
 * Textract job and are answered with 202 plus a status URL. A missing upload or
 * an unrecognized file type yields 400; any thrown error yields 500.
 *
 * @param {Object} request - Request whose `file` holds the upload (buffer + mimetype)
 * @param {Object} response - Response exposing `status(code).send(body)`
 * @returns {Promise<void>}
 */
async function handleBillOcr(request, response) {
    const uploadedFile = request.file;

    // Nothing to do without an upload
    if (!uploadedFile) {
        response.status(400).send({ error: 'No file uploaded.' });
        return;
    }

    // Shared reply for both synchronous paths (image and one-page PDF)
    const sendCompleted = (data) => {
        response.status(200).send({
            success: true,
            status: 'COMPLETED',
            data: data,
            message: 'Invoice processing completed'
        });
    };

    try {
        const fileType = getFileType(uploadedFile);
        console.log(`Processing file type: ${fileType}`);

        switch (fileType) {
            case 'image': {
                // Images are treated as a single page
                console.log('Image => 1 page, processing synchronously');
                sendCompleted(await processSinglePageDocument(uploadedFile.buffer));
                break;
            }
            case 'pdf': {
                const pageCount = await getPdfPageCount(uploadedFile.buffer);
                console.log(`PDF has ${pageCount} page(s)`);

                if (pageCount !== 1) {
                    // Hand the document to Textract's async API and return immediately
                    console.log('PDF => 2+ pages, processing asynchronously');
                    const { jobId } = await startTextractJob(uploadedFile.buffer);

                    response.status(202).send({
                        success: true,
                        jobId: jobId,
                        message: 'Invoice processing started',
                        statusUrl: `/ai/bill-ocr/status/${jobId}`
                    });
                    break;
                }

                console.log('PDF => 1 page, processing synchronously');
                sendCompleted(await processSinglePageDocument(uploadedFile.buffer));
                break;
            }
            default: {
                response.status(400).send({
                    error: 'Unsupported file type',
                    message: 'Please upload a PDF or supported image file (JPEG, PNG, TIFF)'
                });
            }
        }
    } catch (error) {
        console.error('Error starting invoice processing:', error);
        response.status(500).send({
            error: 'Failed to start invoice processing',
            message: error.message
        });
    }
}

/**
 * HTTP handler that reports the state of an asynchronous Textract job.
 *
 * Responses:
 *  - 400 when the `jobId` route parameter is missing
 *  - 404 when no record exists for the job
 *  - 200 with `{ status: 'COMPLETED', data }` once results are stored
 *  - 500 with `{ status: 'FAILED', error }` when the job failed
 *  - 200 with `{ status }` for any other stored status
 *  - 500 with `{ error, message }` when the job record cannot be read
 *
 * @param {Object} request - Request carrying `params.jobId`
 * @param {Object} response - Response exposing `status(code).send(body)`
 * @returns {Promise<void>}
 */
async function handleBillOcrStatus(request, response) {
    console.log('handleBillOcrStatus called');
    console.log('request.params:', request.params);
    console.log('request.query:', request.query);

    const { jobId: textractJobId } = request.params;

    if (!textractJobId) {
        console.log('No textractJobId found in params');
        response.status(400).send({ error: 'Job ID is required' });
        return;
    }

    console.log('Looking for job:', textractJobId);

    let jobStatus;
    try {
        jobStatus = await getTextractJob(textractJobId);
    } catch (error) {
        // Without this guard a Redis failure would reject the handler and the
        // client would never receive a response.
        console.error('Error retrieving invoice processing status:', error);
        response.status(500).send({
            error: 'Failed to retrieve job status',
            message: error.message
        });
        return;
    }
    console.log('Job status:', jobStatus);

    if (!jobStatus) {
        response.status(404).send({ error: 'Job not found' });
        return;
    }

    if (jobStatus.status === 'COMPLETED') {
        response.status(200).send({
            status: 'COMPLETED',
            data: jobStatus.data
        });
    } else if (jobStatus.status === 'FAILED') {
        response.status(500).send({
            status: 'FAILED',
            error: jobStatus.error
        });
    } else {
        response.status(200).send({
            status: jobStatus.status
        });
    }
}

// Leading-byte signatures checked when the MIME type is not conclusive.
const FILE_SIGNATURES = [
    { type: 'pdf', bytes: [0x25, 0x50, 0x44, 0x46] },   // %PDF
    { type: 'image', bytes: [0xFF, 0xD8, 0xFF] },       // JPEG
    { type: 'image', bytes: [0x89, 0x50, 0x4E, 0x47] }, // PNG
    { type: 'image', bytes: [0x49, 0x49, 0x2A, 0x00] }, // TIFF, little-endian ("II*\0")
    { type: 'image', bytes: [0x4D, 0x4D, 0x00, 0x2A] }  // TIFF, big-endian ("MM\0*")
];

// ISO-BMFF major brands accepted as HEIC/HEIF images.
const HEIF_BRANDS = ['heic', 'heix', 'hevc', 'hevx', 'mif1'];

/**
 * Detect file type based on MIME type and file signature.
 *
 * The MIME type wins when it is `application/pdf` or starts with `image/`.
 * Otherwise the first bytes of the buffer are compared against known PDF,
 * JPEG, PNG and TIFF signatures, followed by a HEIC/HEIF `ftyp` brand check.
 *
 * NOTE(review): HEIC/HEIF is reported as 'image' here; confirm the downstream
 * OCR call actually accepts that format.
 *
 * @param {Object} file - Multer file object
 * @returns {string} 'pdf', 'image', or 'unknown'
 */
function getFileType(file) {
    // Check MIME type first
    const mimeType = file.mimetype?.toLowerCase();

    if (mimeType === 'application/pdf') {
        return 'pdf';
    }

    if (mimeType && mimeType.startsWith('image/')) {
        return 'image';
    }

    // Fallback: Check file signature (magic bytes)
    const buffer = file.buffer;
    if (!buffer || !(buffer.length > 4)) {
        return 'unknown';
    }

    const signature = FILE_SIGNATURES.find(({ bytes }) =>
        bytes.every((byte, offset) => buffer[offset] === byte)
    );
    if (signature) {
        return signature.type;
    }

    // HEIC/HEIF: an 'ftyp' box near the start, followed by a known brand
    if (buffer.length > 12) {
        // Only the header matters, so search the first 16 bytes rather than the whole upload.
        const ftypIndex = buffer.subarray(0, 16).indexOf(Buffer.from('ftyp'));
        if (ftypIndex > 0 && ftypIndex < 12) {
            const brand = buffer.subarray(ftypIndex + 4, ftypIndex + 8).toString('ascii');
            if (HEIF_BRANDS.includes(brand)) {
                return 'image';
            }
        }
    }

    return 'unknown';
}

/**
 * Count the pages of a PDF held in memory.
 * @param {Buffer} pdfBuffer - Raw PDF bytes
 * @returns {Promise<number>} Number of pages reported by the parsed document
 * @throws {Error} 'Failed to read PDF: <reason>' when loading or counting fails
 */
async function getPdfPageCount(pdfBuffer) {
    try {
        const loadedDocument = await PDFDocument.load(pdfBuffer);
        const pageTotal = loadedDocument.getPageCount();
        return pageTotal;
    } catch (readError) {
        console.error('Error reading PDF page count:', readError);
        throw new Error(`Failed to read PDF: ${readError.message}`);
    }
}

/**
 * Analyze a document in one synchronous Textract AnalyzeExpense call and
 * return the cleaned invoice data together with the raw response.
 * @param {Buffer} pdfBuffer - Document bytes sent inline to Textract (the
 *   upload handler passes both one-page PDFs and images here)
 * @returns {Promise<Object>} Processed summary/line items plus
 *   `originalTextractResponse`
 */
async function processSinglePageDocument(pdfBuffer) {
    const textractResponse = await textractClient.send(
        new AnalyzeExpenseCommand({
            Document: {
                Bytes: pdfBuffer
            }
        })
    );

    const cleanedInvoice = processScanData(extractInvoiceData(textractResponse));

    return {
        ...cleanedInvoice,
        originalTextractResponse: textractResponse
    };
}

/**
 * Kick off asynchronous Textract expense analysis for a PDF.
 *
 * Uploads the PDF to the configured S3 bucket under `textract-temp/`, starts
 * the analysis with an SNS notification channel, and records the job in Redis
 * as IN_PROGRESS.
 *
 * @param {Buffer} pdfBuffer - Raw PDF bytes
 * @returns {Promise<{jobId: string}>} The Textract job ID
 * @throws {Error} When AWS_AI_BUCKET, AWS_TEXTRACT_SNS_TOPIC_ARN or
 *   AWS_TEXTRACT_SNS_ROLE_ARN is not set
 */
async function startTextractJob(pdfBuffer) {
    const {
        AWS_AI_BUCKET: s3Bucket,
        AWS_TEXTRACT_SNS_TOPIC_ARN: snsTopicArn,
        AWS_TEXTRACT_SNS_ROLE_ARN: snsRoleArn
    } = process.env;

    // Validated in this order so the first missing setting is the one reported.
    const requiredSettings = [
        ['AWS_AI_BUCKET', s3Bucket],
        ['AWS_TEXTRACT_SNS_TOPIC_ARN', snsTopicArn],
        ['AWS_TEXTRACT_SNS_ROLE_ARN', snsRoleArn]
    ];
    for (const [settingName, settingValue] of requiredSettings) {
        if (!settingValue) {
            throw new Error(`${settingName} environment variable is required`);
        }
    }

    const uploadId = uuidv4();
    const s3Key = `textract-temp/${uploadId}.pdf`; //TODO Update Keys structure to something better.

    // Textract's async API reads the document from S3, so upload it first
    await s3Client.send(
        new PutObjectCommand({
            Bucket: s3Bucket,
            Key: s3Key,
            Body: pdfBuffer,
            ContentType: 'application/pdf' //Hard coded - we only support PDFs for multi-page
        })
    );

    // Start the analysis; completion is announced on the SNS topic
    const { JobId: textractJobId } = await textractClient.send(
        new StartExpenseAnalysisCommand({
            DocumentLocation: {
                S3Object: {
                    Bucket: s3Bucket,
                    Name: s3Key
                }
            },
            NotificationChannel: {
                SNSTopicArn: snsTopicArn,
                RoleArn: snsRoleArn
            },
            ClientRequestToken: uploadId
        })
    );

    // Track the job in Redis, keyed by the Textract job ID
    await setTextractJob(textractJobId, {
        status: 'IN_PROGRESS',
        s3Key: s3Key,
        uploadId: uploadId,
        startedAt: new Date().toISOString()
    });

    return { jobId: textractJobId };
}

/**
 * Run one SQS polling pass for Textract completion notifications.
 *
 * Does nothing when the queue URL is not configured or when no job is
 * currently in progress. Otherwise receives up to 10 messages (20 s long
 * poll), hands each to handleTextractNotification, and deletes it once that
 * handler returns. A message whose handler throws is logged and left on the
 * queue. Receive errors are logged, never thrown.
 *
 * @returns {Promise<void>}
 */
async function processSQSMessages() {
    const queueUrl = process.env.AWS_TEXTRACT_SQS_QUEUE_URL;

    if (!queueUrl) {
        console.error('AWS_TEXTRACT_SQS_QUEUE_URL not configured');
        return;
    }

    // Skip the poll entirely unless a multi-page job is waiting on a notification
    if (!(await hasActiveJobs())) {
        console.log('No active jobs in progress, skipping SQS poll');
        return;
    }

    try {
        console.log('Polling SQS queue:', queueUrl);
        const pollResult = await sqsClient.send(
            new ReceiveMessageCommand({
                QueueUrl: queueUrl,
                MaxNumberOfMessages: 10,
                WaitTimeSeconds: 20,
                MessageAttributeNames: ['All']
            })
        );

        const messages = pollResult.Messages;
        console.log('SQS poll result:', messages ? `${messages.length} messages` : 'no messages');

        if (!(messages && messages.length > 0)) {
            return;
        }

        console.log('Processing', messages.length, 'messages from SQS');
        for (const message of messages) {
            try {
                console.log("Processing message:", message);
                await handleTextractNotification(message);

                // Only acknowledge (delete) once the notification was handled
                await sqsClient.send(
                    new DeleteMessageCommand({
                        QueueUrl: queueUrl,
                        ReceiptHandle: message.ReceiptHandle
                    })
                );
            } catch (error) {
                console.error('Error processing message:', error);
            }
        }
    } catch (error) {
        console.error('Error receiving SQS messages:', error);
    }
}

/**
 * Apply one Textract completion notification, received from SQS, to the
 * matching job record in Redis.
 *
 * The SQS body is parsed as JSON and its `Message` property is parsed again to
 * obtain the notification (`JobId`, `Status`, optional `StatusMessage`). If
 * either layer cannot be parsed the message is deleted from the queue right
 * away so it cannot be redelivered forever.
 *
 * - Status 'SUCCEEDED': results are fetched and the record becomes COMPLETED.
 * - Status 'FAILED': the record becomes FAILED with the reported message.
 * - Any other status, or a job with no stored record, changes nothing.
 *
 * @param {Object} message - SQS message with `Body` and `ReceiptHandle`
 * @returns {Promise<void>}
 */
async function handleTextractNotification(message) {
    let snsMessage;
    try {
        // Both parses share one guard: a malformed outer body is just as
        // unprocessable as a malformed inner Message.
        const body = JSON.parse(message.Body);
        snsMessage = JSON.parse(body.Message);
    } catch (error) {
        //Delete the message so it doesn't clog the queue
        const deleteCommand = new DeleteMessageCommand({
            QueueUrl: process.env.AWS_TEXTRACT_SQS_QUEUE_URL,
            ReceiptHandle: message.ReceiptHandle
        });
        await sqsClient.send(deleteCommand);
        console.error('Error parsing SNS message:', error);
        console.log('Message Deleted:', message.Body);
        return;
    }

    // Optional chaining tolerates a payload that parsed to null or a primitive.
    const textractJobId = snsMessage?.JobId;
    const status = snsMessage?.Status;

    // Check if job exists in Redis
    const exists = await jobExists(textractJobId);

    if (!exists) {
        console.warn(`Job not found for Textract job ID: ${textractJobId}`);
        return;
    }

    const jobInfo = await getTextractJob(textractJobId);

    if (!jobInfo) {
        // The record can expire between the existence check and this read;
        // writing a status onto a missing record would create a partial entry.
        console.warn(`Job not found for Textract job ID: ${textractJobId}`);
        return;
    }

    if (status === 'SUCCEEDED') {
        // Retrieve the results
        const { processedData, originalResponse } = await retrieveTextractResults(textractJobId);

        await setTextractJob(textractJobId, {
            ...jobInfo,
            status: 'COMPLETED',
            data: {
                ...processedData,
                originalTextractResponse: originalResponse
            },
            completedAt: new Date().toISOString()
        });
    } else if (status === 'FAILED') {
        await setTextractJob(textractJobId, {
            ...jobInfo,
            status: 'FAILED',
            error: snsMessage.StatusMessage || 'Textract job failed',
            completedAt: new Date().toISOString()
        });
    }
}

/**
 * Fetch the complete result set of a finished Textract expense-analysis job,
 * following `NextToken` until the service stops returning one.
 * @param {string} textractJobId - Textract job ID
 * @returns {Promise<{processedData: Object, originalResponse: Object}>}
 *   Cleaned invoice data plus `{ ExpenseDocuments }` merged across all pages
 */
async function retrieveTextractResults(textractJobId) {
    const expenseDocuments = [];
    let pageToken = null;

    // Results may arrive in several chunks; keep asking while a token comes back
    do {
        const resultPage = await textractClient.send(
            new GetExpenseAnalysisCommand({
                JobId: textractJobId,
                NextToken: pageToken
            })
        );

        if (resultPage.ExpenseDocuments) {
            expenseDocuments.push(...resultPage.ExpenseDocuments);
        }

        pageToken = resultPage.NextToken;
    } while (pageToken);

    // Merged response in the same shape as a single Textract reply
    const originalResponse = { ExpenseDocuments: expenseDocuments };
    const processedData = processScanData(extractInvoiceData(originalResponse));

    return { processedData, originalResponse };
}

/**
 * Start SQS polling (call this when server starts).
 *
 * A poll is attempted every 10 seconds. Because a single poll can itself take
 * longer than that (it long-polls SQS), a tick is skipped while the previous
 * poll is still running so that polls never overlap or pile up.
 *
 * @returns {Object} The interval handle; pass it to clearInterval to stop polling
 */
function startSQSPolling() {
    let isPolling = false;

    const pollInterval = setInterval(() => {
        if (isPolling) {
            return;
        }
        isPolling = true;

        processSQSMessages()
            .catch(error => {
                console.error('SQS polling error:', error);
            })
            .finally(() => {
                isPolling = false;
            });
    }, 10000); // Poll every 10 seconds
    return pollInterval;
}

/**
 * Flatten a Textract expense response into summary fields and line items.
 *
 * Summary fields from every expense document are merged into one object keyed
 * by the field's type text (a later field with the same type replaces an
 * earlier one). Each line item becomes an object keyed by the normalized field
 * type. Fields lacking a type or a value are ignored, as are line items that
 * end up with no fields.
 *
 * @param {Object} textractResponse - Response containing `ExpenseDocuments`
 * @returns {{summary: Object, lineItems: Object[]}} Each field entry holds
 *   `value`, `label`, `normalizedLabel` and `confidence` (0 when absent)
 */
function extractInvoiceData(textractResponse) {
    // Reads one Textract field; yields null when it has no type or no value.
    const readField = (field) => {
        const fieldType = field.Type?.Text || '';
        const fieldValue = field.ValueDetection?.Text || '';
        if (!fieldType || !fieldValue) {
            return null;
        }

        const fieldLabel = field.LabelDetection?.Text || '';
        return {
            fieldType,
            entry: {
                value: fieldValue,
                label: fieldLabel,
                normalizedLabel: normalizeLabelName(fieldLabel),
                confidence: field.ValueDetection?.Confidence || 0
            }
        };
    };

    const summary = {};
    const lineItems = [];

    // One expense document per page of the invoice
    for (const expenseDoc of textractResponse.ExpenseDocuments || []) {
        // Summary fields (vendor, invoice number, date, total, etc.)
        for (const summaryField of expenseDoc.SummaryFields || []) {
            const parsed = readField(summaryField);
            if (parsed) {
                summary[parsed.fieldType] = parsed.entry;
            }
        }

        // Line items, grouped by Textract into line-item groups
        for (const group of expenseDoc.LineItemGroups || []) {
            for (const lineItem of group.LineItems || []) {
                const item = {};

                for (const expenseField of lineItem.LineItemExpenseFields || []) {
                    const parsed = readField(expenseField);
                    if (parsed) {
                        item[normalizeFieldName(parsed.fieldType)] = parsed.entry;
                    }
                }

                if (Object.keys(item).length > 0) {
                    lineItems.push(item);
                }
            }
        }
    }

    return { summary, lineItems };
}

/**
 * Map a Textract line-item field type to the key used in extracted line items.
 * Currently an identity placeholder: the type text is returned unchanged.
 *
 * NOTE(review): processScanData only parses numbers for the keys 'quantity',
 * 'retail_price' and 'actual_price'; with this identity mapping those branches
 * run only if Textract emits exactly those type names — confirm.
 *
 * @param {string} fieldType - Field type text reported by Textract
 * @returns {string} The same value that was passed in
 */
function normalizeFieldName(fieldType) {
    //Placeholder normalization for now.
    return fieldType;
}

// Canonical names that printed invoice labels are mapped onto.
const STANDARD_FIELD_NAMES = Object.freeze({
    actual_cost: "actual_cost",
    actual_price: "actual_price",
    line_desc: "line_desc",
    quantity: "quantity",
    part_no: "part_no"
});

// Common label spellings -> canonical name. A Map is used so that lookups can
// never hit inherited Object.prototype members (e.g. a label "Constructor").
const LABEL_ALIASES = new Map([
    ['qty', STANDARD_FIELD_NAMES.quantity],
    ['qnty', STANDARD_FIELD_NAMES.quantity],
    ['sale_qty', STANDARD_FIELD_NAMES.quantity],
    ['quant', STANDARD_FIELD_NAMES.quantity],
    ['desc', STANDARD_FIELD_NAMES.line_desc],
    ['description', STANDARD_FIELD_NAMES.line_desc],
    ['item', STANDARD_FIELD_NAMES.line_desc],
    ['part', STANDARD_FIELD_NAMES.part_no],
    ['part_no', STANDARD_FIELD_NAMES.part_no],
    ['part_num', STANDARD_FIELD_NAMES.part_no],
    ['part_number', STANDARD_FIELD_NAMES.part_no],
    ['price', STANDARD_FIELD_NAMES.actual_price],
    ['unit_price', STANDARD_FIELD_NAMES.actual_price],
    ['amount', STANDARD_FIELD_NAMES.actual_price],
    ['list_price', STANDARD_FIELD_NAMES.actual_price],
    ['list', STANDARD_FIELD_NAMES.actual_price],
    ['retail_price', STANDARD_FIELD_NAMES.actual_price],
    ['net', STANDARD_FIELD_NAMES.actual_cost],
    ['selling_price', STANDARD_FIELD_NAMES.actual_cost]
]);

/**
 * Normalize a label printed on an invoice into a snake_case identifier and,
 * when it is a known alias, into its canonical field name.
 *
 * Steps: lowercase, drop every character that is not a-z, 0-9 or whitespace,
 * trim, then join the remaining words with underscores. Trimming happens
 * after punctuation is dropped so that labels such as "Part #" or
 * "Unit Price ($)" do not end up with a dangling underscore.
 *
 * @param {string} labelText - Label text as detected on the document
 * @returns {string} Canonical name for known aliases, otherwise the
 *   snake_case form of the label; '' for empty input
 */
function normalizeLabelName(labelText) {
    if (!labelText) return '';

    const normalized = labelText
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, '')
        .trim()
        .replace(/\s+/g, '_');

    return LABEL_ALIASES.get(normalized) ?? normalized; // TODO: Should we monitor unmapped labels?
}

/**
 * Clean extracted invoice data: drop low-confidence fields and coerce numeric
 * line-item values.
 *
 * A field is kept only when its confidence is strictly greater than
 * MIN_CONFIDENCE_VALUE. For line items, the 'quantity' key is parsed as a
 * float, and the 'retail_price' / 'actual_price' keys are parsed after
 * stripping everything except digits, '.' and '-'; unparseable values become 0.
 * No line items are filtered out, so an item whose fields all fall below the
 * threshold comes back as an empty object.
 *
 * @param {{summary: Object, lineItems: Object[]}} invoiceData - Output of extractInvoiceData
 * @returns {{summary: Object, lineItems: Object[]}} Cleaned copy of the data
 */
function processScanData(invoiceData) {
    const isConfident = (field) => field.confidence > MIN_CONFIDENCE_VALUE;

    // Converts the raw text of numeric line-item fields; other keys pass through.
    const coerceLineItemValue = (key, rawValue) => {
        switch (key) {
            case 'quantity':
                return parseFloat(rawValue) || 0;
            case 'retail_price':
            case 'actual_price':
                // Strip currency symbols and thousands separators before parsing
                return parseFloat(rawValue.replace(/[^0-9.-]/g, '')) || 0;
            default:
                return rawValue;
        }
    };

    const summary = {};
    for (const [fieldName, field] of Object.entries(invoiceData.summary)) {
        if (!isConfident(field)) {
            continue;
        }
        summary[fieldName] = {
            value: field.value,
            label: field.label,
            normalizedLabel: field.normalizedLabel,
            confidence: field.confidence
        };
    }

    const lineItems = invoiceData.lineItems.map((item) => {
        const cleanedItem = {};

        for (const [fieldName, field] of Object.entries(item)) {
            if (!isConfident(field)) {
                continue;
            }
            cleanedItem[fieldName] = {
                value: coerceLineItemValue(fieldName, field.value),
                label: field.label,
                normalizedLabel: field.normalizedLabel,
                confidence: field.confidence
            };
        }

        return cleanedItem;
    });

    return { summary, lineItems };
}

// Public API of this module: the Redis initialization hook, the two HTTP
// handlers (upload and job status), and the SQS poller starter. Every other
// function in this file is not exported.
module.exports = {
    initializeBillOcr,
    handleBillOcr,
    handleBillOcrStatus,
    startSQSPolling
};