Update unique and duplicate document counts.

This commit is contained in:
Patrick Fic
2025-11-17 16:28:40 -08:00
parent e8da1a6c26
commit 2ca135fd55

View File

@@ -8,6 +8,8 @@ import { FolderPaths } from "./serverInit.js";
import axios from "axios";
import { UUID } from "crypto";
import fsPromises from "fs/promises";
import crypto from "crypto";
import { createReadStream } from "fs";
const execAsync = promisify(exec);
@@ -24,6 +26,8 @@ export interface JobFolderStats {
jobid: UUID | string | null;
//relativePath: string;
document_count: number;
unique_document_count: number;
duplicate_count: number;
total_size_bytes: number;
total_size_mb: number;
file_type_stats: { [extension: string]: number };
@@ -33,6 +37,8 @@ export interface JobsDirectoryAnalysis {
bodyshopid: UUID;
total_jobs: number;
total_documents: number;
unique_documents: number;
duplicate_documents: number;
total_size_bytes: number;
total_size_mb: number;
file_type_stats: { [extension: string]: number };
@@ -203,6 +209,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
bodyshopid,
total_jobs: 0,
total_documents: 0,
unique_documents: 0,
duplicate_documents: 0,
total_size_bytes: 0,
total_size_mb: 0,
file_type_stats: {},
@@ -216,6 +224,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
const jobFolders = await readdir(jobsPath);
let total_documents = 0;
let total_unique_documents = 0;
let total_duplicate_documents = 0;
let total_size_bytes = 0;
let total_jobs = 0;
const aggregated_file_type_stats: { [extension: string]: number } = {};
@@ -237,6 +247,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
total_jobs++;
total_documents += folderStats.document_count;
total_unique_documents += folderStats.unique_document_count;
total_duplicate_documents += folderStats.duplicate_count;
total_size_bytes += folderStats.total_size_bytes;
// Aggregate file type stats
@@ -268,6 +280,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
bodyshopid, //read from the config.json file in the root directory
total_jobs,
total_documents,
unique_documents: total_unique_documents,
duplicate_documents: total_duplicate_documents,
total_size_bytes,
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
file_type_stats: aggregated_file_type_stats,
@@ -275,7 +289,7 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
};
logger.info(
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents, ${analysis.total_size_mb} MB`
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents (${analysis.unique_documents} unique, ${analysis.duplicate_documents} duplicates), ${analysis.total_size_mb} MB`
);
//Add an upload to the IO database to categorize all of this.
@@ -311,7 +325,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> {
const jobFolderPath = path.join(jobsPath, jobid);
const { document_count, total_size_bytes, file_type_stats } = await getDirectoryStats(jobFolderPath);
const { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats } =
await getDirectoryStats(jobFolderPath);
const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/;
let validJobid: UUID | string | null = null;
@@ -327,6 +342,8 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
bodyshopid,
jobid: validJobid,
document_count,
unique_document_count,
duplicate_count,
total_size_bytes,
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
file_type_stats
@@ -334,14 +351,40 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
}
/**
* Recursively get document count and total size for a directory (helper function)
* Calculate SHA256 hash of a file's content
*/
async function getDirectoryStats(
dirPath: string
): Promise<{ document_count: number; total_size_bytes: number; file_type_stats: { [extension: string]: number } }> {
/**
 * Calculate the SHA-256 hash of a file's content.
 *
 * Streams the file rather than reading it fully into memory, so hashing
 * large files (e.g. photos) does not inflate heap usage.
 *
 * @param filePath - Path of the file to hash.
 * @returns Hex-encoded SHA-256 digest of the file's bytes.
 * @throws Rejects with the underlying stream error (e.g. ENOENT).
 *         Logging is left to callers — the visible call site already
 *         catches rejections and logs via `logger.warning`, so the old
 *         `console.error` here produced a duplicate log line for every
 *         failed hash and bypassed the file's logger.
 */
async function calculateFileHash(filePath: string): Promise<string> {
	return new Promise((resolve, reject) => {
		const hash = crypto.createHash("sha256");
		const stream = createReadStream(filePath);
		stream.on("data", (chunk: string | Buffer) => hash.update(chunk));
		stream.on("end", () => resolve(hash.digest("hex")));
		stream.on("error", reject);
	});
}
/**
* Recursively get document count and total size for a directory (helper function)
* Now with duplicate detection using content-based hashing
*/
async function getDirectoryStats(dirPath: string): Promise<{
document_count: number;
unique_document_count: number;
duplicate_count: number;
total_size_bytes: number;
file_type_stats: { [extension: string]: number };
contentHashes: Set<string>;
}> {
let document_count = 0;
let unique_document_count = 0;
let duplicate_count = 0;
let total_size_bytes = 0;
const file_type_stats: { [extension: string]: number } = {};
const contentHashes = new Set<string>();
try {
const items = await readdir(dirPath);
@@ -360,12 +403,19 @@ async function getDirectoryStats(
// Recursively analyze subdirectories
const subStats = await getDirectoryStats(itemPath);
document_count += subStats.document_count;
unique_document_count += subStats.unique_document_count;
duplicate_count += subStats.duplicate_count;
total_size_bytes += subStats.total_size_bytes;
// Merge file type stats
for (const [ext, count] of Object.entries(subStats.file_type_stats)) {
file_type_stats[ext] = (file_type_stats[ext] || 0) + count;
}
// Merge content hashes to detect duplicates across subdirectories
for (const hash of subStats.contentHashes) {
contentHashes.add(hash);
}
} else {
// Count files as documents
document_count++;
@@ -373,14 +423,53 @@ async function getDirectoryStats(
// Track file extension
const ext = path.extname(item).toLowerCase() || "no-extension";
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
// Calculate content hash for image files to detect duplicates
const isImageFile = [
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".webp",
".heic",
".heif",
".tiff",
".tif"
].includes(ext);
if (isImageFile) {
try {
const fileHash = await calculateFileHash(itemPath);
if (contentHashes.has(fileHash)) {
// This is a duplicate - don't count in file_type_stats
duplicate_count++;
} else {
// This is unique
contentHashes.add(fileHash);
unique_document_count++;
// Only count unique files in file_type_stats
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
}
} catch (hashError) {
logger.warning(`Failed to hash file ${itemPath}:`, hashError);
// If hashing fails, count as unique to avoid losing data
unique_document_count++;
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
}
} else {
// Non-image files are counted as unique (not checking for duplicates)
unique_document_count++;
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
}
}
}
} catch (error) {
logger.error(`Error analyzing directory ${dirPath}:`, error);
}
return { document_count, total_size_bytes, file_type_stats };
return { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats, contentHashes };
}
let bodyshopid: UUID;