update unique counts.

This commit is contained in:
Patrick Fic
2025-11-17 16:28:40 -08:00
parent e8da1a6c26
commit 2ca135fd55

View File

@@ -8,6 +8,8 @@ import { FolderPaths } from "./serverInit.js";
import axios from "axios"; import axios from "axios";
import { UUID } from "crypto"; import { UUID } from "crypto";
import fsPromises from "fs/promises"; import fsPromises from "fs/promises";
import crypto from "crypto";
import { createReadStream } from "fs";
const execAsync = promisify(exec); const execAsync = promisify(exec);
@@ -24,6 +26,8 @@ export interface JobFolderStats {
jobid: UUID | string | null; jobid: UUID | string | null;
//relativePath: string; //relativePath: string;
document_count: number; document_count: number;
unique_document_count: number;
duplicate_count: number;
total_size_bytes: number; total_size_bytes: number;
total_size_mb: number; total_size_mb: number;
file_type_stats: { [extension: string]: number }; file_type_stats: { [extension: string]: number };
@@ -33,6 +37,8 @@ export interface JobsDirectoryAnalysis {
bodyshopid: UUID; bodyshopid: UUID;
total_jobs: number; total_jobs: number;
total_documents: number; total_documents: number;
unique_documents: number;
duplicate_documents: number;
total_size_bytes: number; total_size_bytes: number;
total_size_mb: number; total_size_mb: number;
file_type_stats: { [extension: string]: number }; file_type_stats: { [extension: string]: number };
@@ -203,6 +209,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
bodyshopid, bodyshopid,
total_jobs: 0, total_jobs: 0,
total_documents: 0, total_documents: 0,
unique_documents: 0,
duplicate_documents: 0,
total_size_bytes: 0, total_size_bytes: 0,
total_size_mb: 0, total_size_mb: 0,
file_type_stats: {}, file_type_stats: {},
@@ -216,6 +224,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
const jobFolders = await readdir(jobsPath); const jobFolders = await readdir(jobsPath);
let total_documents = 0; let total_documents = 0;
let total_unique_documents = 0;
let total_duplicate_documents = 0;
let total_size_bytes = 0; let total_size_bytes = 0;
let total_jobs = 0; let total_jobs = 0;
const aggregated_file_type_stats: { [extension: string]: number } = {}; const aggregated_file_type_stats: { [extension: string]: number } = {};
@@ -237,6 +247,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
total_jobs++; total_jobs++;
total_documents += folderStats.document_count; total_documents += folderStats.document_count;
total_unique_documents += folderStats.unique_document_count;
total_duplicate_documents += folderStats.duplicate_count;
total_size_bytes += folderStats.total_size_bytes; total_size_bytes += folderStats.total_size_bytes;
// Aggregate file type stats // Aggregate file type stats
@@ -268,6 +280,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
bodyshopid, //read from the config.json file in the root directory bodyshopid, //read from the config.json file in the root directory
total_jobs, total_jobs,
total_documents, total_documents,
unique_documents: total_unique_documents,
duplicate_documents: total_duplicate_documents,
total_size_bytes, total_size_bytes,
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100, total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
file_type_stats: aggregated_file_type_stats, file_type_stats: aggregated_file_type_stats,
@@ -275,9 +289,9 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
}; };
logger.info( logger.info(
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents, ${analysis.total_size_mb} MB` `Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents (${analysis.unique_documents} unique, ${analysis.duplicate_documents} duplicates), ${analysis.total_size_mb} MB`
); );
//Add an upload to the IO database to categorize all of this. //Add an upload to the IO database to categorize all of this.
const apiURL = process.env.IS_TEST const apiURL = process.env.IS_TEST
? "https://api.test.imex.online/analytics/documents" ? "https://api.test.imex.online/analytics/documents"
@@ -311,7 +325,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> { async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> {
const jobFolderPath = path.join(jobsPath, jobid); const jobFolderPath = path.join(jobsPath, jobid);
const { document_count, total_size_bytes, file_type_stats } = await getDirectoryStats(jobFolderPath); const { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats } =
await getDirectoryStats(jobFolderPath);
const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/; const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/;
let validJobid: UUID | string | null = null; let validJobid: UUID | string | null = null;
@@ -327,6 +342,8 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
bodyshopid, bodyshopid,
jobid: validJobid, jobid: validJobid,
document_count, document_count,
unique_document_count,
duplicate_count,
total_size_bytes, total_size_bytes,
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100, total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
file_type_stats file_type_stats
@@ -334,14 +351,40 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
} }
/**
 * Calculate the SHA-256 hash of a file's content.
 *
 * The file is streamed chunk by chunk, so large files are never loaded into
 * memory in one piece.
 *
 * @param filePath - Path of the file to hash.
 * @returns The hex-encoded SHA-256 digest of the file's bytes.
 * @throws Rejects with the underlying stream error (for example when the file
 *   is missing or unreadable). Nothing is logged here; the caller is expected
 *   to report the failure together with the file path.
 */
async function calculateFileHash(filePath: string): Promise<string> {
  const hash = crypto.createHash("sha256");
  // Async iteration finishes at end-of-file and throws if the stream emits an
  // error, which turns into a rejection of the returned promise.
  for await (const chunk of createReadStream(filePath)) {
    hash.update(chunk);
  }
  return hash.digest("hex");
}
/**
* Recursively get document count and total size for a directory (helper function)
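* Duplicate detection is content-based (SHA-256) and applies to image files only; all other files are counted as unique.
* NOTE(review): every recursive call starts with an empty hash set, so copies located in sibling subdirectories are not matched against each other — confirm whether that is intended.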
*/
async function getDirectoryStats(dirPath: string): Promise<{
document_count: number;
unique_document_count: number;
duplicate_count: number;
total_size_bytes: number;
file_type_stats: { [extension: string]: number };
contentHashes: Set<string>;
}> {
let document_count = 0; let document_count = 0;
let unique_document_count = 0;
let duplicate_count = 0;
let total_size_bytes = 0; let total_size_bytes = 0;
const file_type_stats: { [extension: string]: number } = {}; const file_type_stats: { [extension: string]: number } = {};
const contentHashes = new Set<string>();
try { try {
const items = await readdir(dirPath); const items = await readdir(dirPath);
@@ -360,12 +403,19 @@ async function getDirectoryStats(
// Recursively analyze subdirectories // Recursively analyze subdirectories
const subStats = await getDirectoryStats(itemPath); const subStats = await getDirectoryStats(itemPath);
document_count += subStats.document_count; document_count += subStats.document_count;
unique_document_count += subStats.unique_document_count;
duplicate_count += subStats.duplicate_count;
total_size_bytes += subStats.total_size_bytes; total_size_bytes += subStats.total_size_bytes;
// Merge file type stats // Merge file type stats
for (const [ext, count] of Object.entries(subStats.file_type_stats)) { for (const [ext, count] of Object.entries(subStats.file_type_stats)) {
file_type_stats[ext] = (file_type_stats[ext] || 0) + count; file_type_stats[ext] = (file_type_stats[ext] || 0) + count;
} }
// Merge content hashes to detect duplicates across subdirectories
for (const hash of subStats.contentHashes) {
contentHashes.add(hash);
}
} else { } else {
// Count files as documents // Count files as documents
document_count++; document_count++;
@@ -373,14 +423,53 @@ async function getDirectoryStats(
// Track file extension // Track file extension
const ext = path.extname(item).toLowerCase() || "no-extension"; const ext = path.extname(item).toLowerCase() || "no-extension";
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
// Calculate content hash for image files to detect duplicates
const isImageFile = [
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".webp",
".heic",
".heif",
".tiff",
".tif"
].includes(ext);
if (isImageFile) {
try {
const fileHash = await calculateFileHash(itemPath);
if (contentHashes.has(fileHash)) {
// This is a duplicate - don't count in file_type_stats
duplicate_count++;
} else {
// This is unique
contentHashes.add(fileHash);
unique_document_count++;
// Only count unique files in file_type_stats
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
}
} catch (hashError) {
logger.warning(`Failed to hash file ${itemPath}:`, hashError);
// If hashing fails, count as unique to avoid losing data
unique_document_count++;
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
}
} else {
// Non-image files are counted as unique (not checking for duplicates)
unique_document_count++;
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
}
} }
} }
} catch (error) { } catch (error) {
logger.error(`Error analyzing directory ${dirPath}:`, error); logger.error(`Error analyzing directory ${dirPath}:`, error);
} }
return { document_count, total_size_bytes, file_type_stats }; return { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats, contentHashes };
} }
let bodyshopid: UUID; let bodyshopid: UUID;