const PDFDocument = require('pdf-lib').PDFDocument; const TEXTRACT_REDIS_PREFIX = "textract:" const TEXTRACT_JOB_TTL = 3600; /** * Generate Redis key for Textract job using textract job ID * @param {string} textractJobId * @returns {string} */ function getTextractJobKey(textractJobId) { return `${TEXTRACT_REDIS_PREFIX}:${textractJobId}`; } /** * Store Textract job data in Redis * @param {string} textractJobId * @param {Object} redisPubClient * @param {Object} jobData */ async function setTextractJob({ redisPubClient, textractJobId, jobData }) { if (!redisPubClient) { throw new Error('Redis client not initialized. Call initializeBillOcr first.'); } const key = getTextractJobKey(textractJobId); await redisPubClient.set(key, JSON.stringify(jobData)); await redisPubClient.expire(key, TEXTRACT_JOB_TTL); } /** * Retrieve Textract job data from Redis * @param {string} textractJobId * @param {Object} redisPubClient * @returns {Promise} */ async function getTextractJob({ redisPubClient, textractJobId }) { if (!redisPubClient) { throw new Error('Redis client not initialized. Call initializeBillOcr first.'); } const key = getTextractJobKey(textractJobId); const data = await redisPubClient.get(key); return data ? JSON.parse(data) : null; } /** * Detect file type based on MIME type and file signature * @param {Object} file - Multer file object * @returns {string} 'pdf', 'image', or 'unknown' */ function getFileType(file) { // Check MIME type first const mimeType = file.mimetype?.toLowerCase(); if (mimeType === 'application/pdf') { return 'pdf'; } if (mimeType && mimeType.startsWith('image/')) { return 'image'; } // Fallback: Check file signature (magic bytes) const buffer = file.buffer; if (buffer && buffer.length > 4) { // PDF signature: %PDF if (buffer[0] === 0x25 && buffer[1] === 0x50 && buffer[2] === 0x44 && buffer[3] === 0x46) { return 'pdf'; } // JPEG signature: FF D8 FF if (buffer[0] === 0xFF && buffer[1] === 0xD8 && buffer[2] === 0xFF) { return 'image'; } // PNG signature: 89 50 4E 47 if (buffer[0] === 0x89 && buffer[1] === 0x50 && buffer[2] === 0x4E && buffer[3] === 0x47) { return 'image'; } // HEIC/HEIF: Check for ftyp followed by heic/heix/hevc/hevx if (buffer.length > 12) { const ftypIndex = buffer.indexOf(Buffer.from('ftyp')); if (ftypIndex > 0 && ftypIndex < 12) { const brand = buffer.slice(ftypIndex + 4, ftypIndex + 8).toString('ascii'); if (brand.startsWith('heic') || brand.startsWith('heix') || brand.startsWith('hevc') || brand.startsWith('hevx') || brand.startsWith('mif1')) { return 'image'; } } } } return 'unknown'; } /** * Get the number of pages in a PDF buffer * @param {Buffer} pdfBuffer * @returns {Promise} */ async function getPdfPageCount(pdfBuffer) { try { const pdfDoc = await PDFDocument.load(pdfBuffer); return pdfDoc.getPageCount(); } catch (error) { console.error('Error reading PDF page count:', error); throw new Error('Failed to read PDF: ' + error.message); } } /** * Check if there are any jobs in IN_PROGRESS status * @returns {Promise} */ async function hasActiveJobs({ redisPubClient }) { if (!redisPubClient) { throw new Error('Redis client not initialized.'); } try { // Get all textract job keys const pattern = `${TEXTRACT_REDIS_PREFIX}:*`; const keys = await redisPubClient.keys(pattern); if (!keys || keys.length === 0) { return false; } //TODO: Is there a better way to do this that supports clusters? // Check if any job has IN_PROGRESS status for (const key of keys) { const data = await redisPubClient.get(key); if (data) { const jobData = JSON.parse(data); if (jobData.status === 'IN_PROGRESS') { return true; } } } return false; } catch (error) { console.error('Error checking for active jobs:', error); return false; } } module.exports = { getTextractJobKey, setTextractJob, getTextractJob, getFileType, getPdfPageCount, hasActiveJobs, TEXTRACT_REDIS_PREFIX }