160 lines
4.6 KiB
JavaScript
160 lines
4.6 KiB
JavaScript
const PDFDocument = require('pdf-lib').PDFDocument;
|
|
const logger = require("../../utils/logger");
|
|
// Namespace shared by every Textract job key in Redis, scoped by the current NODE_ENV.
const TEXTRACT_REDIS_PREFIX = `textract:${process.env.NODE_ENV}`;

// Time-to-live handed to Redis EXPIRE for stored Textract jobs: 10 minutes, in seconds.
const TEXTRACT_JOB_TTL = 10 * 60;
|
|
|
|
|
|
/**
 * Build the Redis key under which a Textract job's data is stored.
 *
 * @param {string} textractJobId - Textract job identifier appended to the shared prefix.
 * @returns {string} Key of the form `<TEXTRACT_REDIS_PREFIX>:<textractJobId>`.
 */
function getTextractJobKey(textractJobId) {
  const jobKey = `${TEXTRACT_REDIS_PREFIX}:${textractJobId}`;
  return jobKey;
}
|
|
|
|
|
|
/**
 * Persist a Textract job's data in Redis as JSON and give the key a time-to-live.
 *
 * @param {Object} params
 * @param {Object} params.redisPubClient - Redis client whose `set` and `expire` calls are awaited.
 * @param {string} params.textractJobId - Textract job identifier used to derive the Redis key.
 * @param {Object} params.jobData - Job data; serialized with JSON.stringify before storing.
 * @returns {Promise<void>}
 * @throws {Error} If no Redis client is supplied.
 */
async function setTextractJob({ redisPubClient, textractJobId, jobData }) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }

  const redisKey = getTextractJobKey(textractJobId);
  const serializedJob = JSON.stringify(jobData);

  // The value is written first and the TTL is attached by a second command.
  // NOTE(review): if `expire` fails after `set` succeeded, the key is left without a TTL.
  await redisPubClient.set(redisKey, serializedJob);
  await redisPubClient.expire(redisKey, TEXTRACT_JOB_TTL);
}
|
|
|
|
/**
 * Load a Textract job's data from Redis and decode it from JSON.
 *
 * @param {Object} params
 * @param {Object} params.redisPubClient - Redis client whose `get` call is awaited.
 * @param {string} params.textractJobId - Textract job identifier used to derive the Redis key.
 * @returns {Promise<Object|null>} Parsed job data, or null when nothing is stored under the key.
 * @throws {Error} If no Redis client is supplied.
 */
async function getTextractJob({ redisPubClient, textractJobId }) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized. Call initializeBillOcr first.');
  }

  const rawJob = await redisPubClient.get(getTextractJobKey(textractJobId));
  if (!rawJob) {
    return null;
  }

  return JSON.parse(rawJob);
}
|
|
|
|
/**
 * Detect the broad type of an uploaded file.
 *
 * The declared MIME type is checked first. When it is missing or not
 * recognized, the leading bytes of the file buffer are compared against known
 * signatures: PDF, JPEG, PNG and the HEIC/HEIF `ftyp` brands
 * heic/heix/hevc/hevx/mif1.
 *
 * @param {Object} file - Multer file object.
 * @param {string} [file.mimetype] - Declared MIME type (compared case-insensitively).
 * @param {Buffer} [file.buffer] - File contents, used for signature sniffing.
 * @returns {string} 'pdf', 'image', or 'unknown'
 */
function getFileType(file) {
  // A missing file cannot be classified; report that instead of throwing.
  if (!file) {
    return 'unknown';
  }

  // Check MIME type first
  const mimeType = file.mimetype?.toLowerCase();
  if (mimeType === 'application/pdf') {
    return 'pdf';
  }
  if (mimeType?.startsWith('image/')) {
    return 'image';
  }

  // Fallback: check file signature (magic bytes)
  const buffer = file.buffer;
  if (!buffer || buffer.length === 0) {
    return 'unknown';
  }

  // Out-of-range reads yield undefined, so a buffer shorter than the
  // signature simply fails the comparison.
  const startsWithBytes = (signature) => signature.every((byte, offset) => buffer[offset] === byte);

  // PDF signature: %PDF
  if (startsWithBytes([0x25, 0x50, 0x44, 0x46])) {
    return 'pdf';
  }

  // JPEG signature: FF D8 FF
  if (startsWithBytes([0xff, 0xd8, 0xff])) {
    return 'image';
  }

  // PNG signature: 89 50 4E 47
  if (startsWithBytes([0x89, 0x50, 0x4e, 0x47])) {
    return 'image';
  }

  // HEIC/HEIF: `ftyp` near the start, followed by a 4-character brand.
  if (buffer.length > 12) {
    // `ftyp` is only accepted at offsets 1..11, so the first 15 bytes are all
    // that ever need scanning; this avoids walking the entire upload.
    const ftypIndex = buffer.subarray(0, 15).indexOf(Buffer.from('ftyp'));
    if (ftypIndex > 0 && ftypIndex < 12) {
      const brand = buffer.subarray(ftypIndex + 4, ftypIndex + 8).toString('ascii');
      if (['heic', 'heix', 'hevc', 'hevx', 'mif1'].includes(brand)) {
        return 'image';
      }
    }
  }

  return 'unknown';
}
|
|
|
|
/**
 * Get the number of pages in a PDF buffer.
 *
 * @param {Buffer} pdfBuffer - Raw PDF bytes, handed to PDFDocument.load.
 * @returns {Promise<number>} Page count reported by the loaded document.
 * @throws {Error} 'Failed to read PDF: <reason>' when the document cannot be
 *   loaded or counted; the underlying error is attached as `cause`.
 */
async function getPdfPageCount(pdfBuffer) {
  try {
    const pdfDoc = await PDFDocument.load(pdfBuffer);
    return pdfDoc.getPageCount();
  } catch (error) {
    // Non-Error throwables have no `message`; fall back to their string form.
    const reason = error?.message ?? String(error);

    // Logged through the module logger, in the same shape as the other
    // bill-ocr error logs in this file.
    logger.log("bill-ocr-pdf-page-count-error", "ERROR", "api", null, { error: reason, stack: error?.stack });

    // Keep the original error (and its stack) reachable for callers.
    throw new Error(`Failed to read PDF: ${reason}`, { cause: error });
  }
}
|
|
|
|
/**
 * Check whether any stored Textract job has status 'IN_PROGRESS'.
 *
 * Every key under the Textract prefix is fetched and its JSON value inspected.
 * A value that cannot be parsed is logged and skipped, so it cannot hide an
 * in-progress job stored under another key. Any other failure (for example the
 * Redis calls rejecting) is logged and reported as "no active jobs".
 *
 * @param {Object} params
 * @param {Object} params.redisPubClient - Redis client whose `keys` and `get` calls are awaited.
 * @returns {Promise<boolean>} true if at least one job is IN_PROGRESS, otherwise false.
 * @throws {Error} If no Redis client is supplied.
 */
async function hasActiveJobs({ redisPubClient }) {
  if (!redisPubClient) {
    throw new Error('Redis client not initialized.');
  }

  try {
    // Get all textract job keys
    //TODO: Is there a better way to do this that supports clusters?
    const keys = await redisPubClient.keys(`${TEXTRACT_REDIS_PREFIX}:*`);
    if (!keys || keys.length === 0) {
      return false;
    }

    // Check if any job has IN_PROGRESS status
    for (const key of keys) {
      const data = await redisPubClient.get(key);
      if (!data) {
        // Nothing stored under this key any more.
        continue;
      }

      let jobData;
      try {
        jobData = JSON.parse(data);
      } catch (parseError) {
        // One unreadable entry must not mask jobs stored under the remaining keys.
        logger.log("bill-ocr-job-check-error", "ERROR", "api", null, { key, error: parseError.message, stack: parseError.stack });
        continue;
      }

      // `?.` guards against values that parse to null (JSON "null").
      if (jobData?.status === 'IN_PROGRESS') {
        return true;
      }
    }

    return false;
  } catch (error) {
    // Best effort: a failing lookup is logged and treated as "no active jobs".
    logger.log("bill-ocr-job-check-error", "ERROR", "api", null, { error: error.message, stack: error.stack });
    return false;
  }
}
|
|
|
|
module.exports = {
|
|
getTextractJobKey,
|
|
setTextractJob,
|
|
getTextractJob,
|
|
getFileType,
|
|
getPdfPageCount,
|
|
hasActiveJobs,
|
|
TEXTRACT_REDIS_PREFIX
|
|
}
|
|
|