update unique counts.
This commit is contained in:
103
util/s3Sync.ts
103
util/s3Sync.ts
@@ -8,6 +8,8 @@ import { FolderPaths } from "./serverInit.js";
|
||||
import axios from "axios";
|
||||
import { UUID } from "crypto";
|
||||
import fsPromises from "fs/promises";
|
||||
import crypto from "crypto";
|
||||
import { createReadStream } from "fs";
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
@@ -24,6 +26,8 @@ export interface JobFolderStats {
|
||||
jobid: UUID | string | null;
|
||||
//relativePath: string;
|
||||
document_count: number;
|
||||
unique_document_count: number;
|
||||
duplicate_count: number;
|
||||
total_size_bytes: number;
|
||||
total_size_mb: number;
|
||||
file_type_stats: { [extension: string]: number };
|
||||
@@ -33,6 +37,8 @@ export interface JobsDirectoryAnalysis {
|
||||
bodyshopid: UUID;
|
||||
total_jobs: number;
|
||||
total_documents: number;
|
||||
unique_documents: number;
|
||||
duplicate_documents: number;
|
||||
total_size_bytes: number;
|
||||
total_size_mb: number;
|
||||
file_type_stats: { [extension: string]: number };
|
||||
@@ -203,6 +209,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
||||
bodyshopid,
|
||||
total_jobs: 0,
|
||||
total_documents: 0,
|
||||
unique_documents: 0,
|
||||
duplicate_documents: 0,
|
||||
total_size_bytes: 0,
|
||||
total_size_mb: 0,
|
||||
file_type_stats: {},
|
||||
@@ -216,6 +224,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
||||
|
||||
const jobFolders = await readdir(jobsPath);
|
||||
let total_documents = 0;
|
||||
let total_unique_documents = 0;
|
||||
let total_duplicate_documents = 0;
|
||||
let total_size_bytes = 0;
|
||||
let total_jobs = 0;
|
||||
const aggregated_file_type_stats: { [extension: string]: number } = {};
|
||||
@@ -237,6 +247,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
||||
|
||||
total_jobs++;
|
||||
total_documents += folderStats.document_count;
|
||||
total_unique_documents += folderStats.unique_document_count;
|
||||
total_duplicate_documents += folderStats.duplicate_count;
|
||||
total_size_bytes += folderStats.total_size_bytes;
|
||||
|
||||
// Aggregate file type stats
|
||||
@@ -268,6 +280,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
||||
bodyshopid, //read from the config.json file in the root directory
|
||||
total_jobs,
|
||||
total_documents,
|
||||
unique_documents: total_unique_documents,
|
||||
duplicate_documents: total_duplicate_documents,
|
||||
total_size_bytes,
|
||||
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
|
||||
file_type_stats: aggregated_file_type_stats,
|
||||
@@ -275,7 +289,7 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
||||
};
|
||||
|
||||
logger.info(
|
||||
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents, ${analysis.total_size_mb} MB`
|
||||
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents (${analysis.unique_documents} unique, ${analysis.duplicate_documents} duplicates), ${analysis.total_size_mb} MB`
|
||||
);
|
||||
|
||||
//Add an upload to the IO database to categorize all of this.
|
||||
@@ -311,7 +325,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
||||
async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> {
|
||||
const jobFolderPath = path.join(jobsPath, jobid);
|
||||
|
||||
const { document_count, total_size_bytes, file_type_stats } = await getDirectoryStats(jobFolderPath);
|
||||
const { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats } =
|
||||
await getDirectoryStats(jobFolderPath);
|
||||
|
||||
const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/;
|
||||
let validJobid: UUID | string | null = null;
|
||||
@@ -327,6 +342,8 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
|
||||
bodyshopid,
|
||||
jobid: validJobid,
|
||||
document_count,
|
||||
unique_document_count,
|
||||
duplicate_count,
|
||||
total_size_bytes,
|
||||
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
|
||||
file_type_stats
|
||||
@@ -334,14 +351,40 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively get document count and total size for a directory (helper function)
|
||||
* Calculate SHA256 hash of a file's content
|
||||
*/
|
||||
async function getDirectoryStats(
|
||||
dirPath: string
|
||||
): Promise<{ document_count: number; total_size_bytes: number; file_type_stats: { [extension: string]: number } }> {
|
||||
/**
 * Stream a file from disk and compute the SHA-256 digest of its bytes.
 *
 * The file is read via a read stream and fed chunk by chunk into the hash,
 * so the whole file is never buffered by this function.
 *
 * @param filePath - Path of the file whose content should be hashed.
 * @returns A promise resolving to the digest as a hex string once the stream ends.
 * @throws Rejects with the stream's error (after printing it via `console.error`)
 *         when the read stream emits `error`.
 */
async function calculateFileHash(filePath: string): Promise<string> {
  const digester = crypto.createHash("sha256");
  const source = createReadStream(filePath);

  return new Promise<string>((resolve, reject) => {
    // Feed every chunk emitted by the stream into the running digest.
    const onChunk = (chunk: string | Buffer): void => {
      digester.update(chunk);
    };

    // The stream is exhausted: finalize the digest and hand it back as hex.
    const onFinished = (): void => {
      resolve(digester.digest("hex"));
    };

    // Report the stream failure, then propagate it to the caller.
    const onFailure = (error: Error): void => {
      console.error("Hashing error:", error);
      reject(error);
    };

    source.on("data", onChunk);
    source.on("end", onFinished);
    source.on("error", onFailure);
  });
}
|
||||
|
||||
/**
|
||||
* Recursively get document count and total size for a directory (helper function)
|
||||
* Now with duplicate detection using content-based hashing
|
||||
*/
|
||||
async function getDirectoryStats(dirPath: string): Promise<{
|
||||
document_count: number;
|
||||
unique_document_count: number;
|
||||
duplicate_count: number;
|
||||
total_size_bytes: number;
|
||||
file_type_stats: { [extension: string]: number };
|
||||
contentHashes: Set<string>;
|
||||
}> {
|
||||
let document_count = 0;
|
||||
let unique_document_count = 0;
|
||||
let duplicate_count = 0;
|
||||
let total_size_bytes = 0;
|
||||
const file_type_stats: { [extension: string]: number } = {};
|
||||
const contentHashes = new Set<string>();
|
||||
|
||||
try {
|
||||
const items = await readdir(dirPath);
|
||||
@@ -360,12 +403,19 @@ async function getDirectoryStats(
|
||||
// Recursively analyze subdirectories
|
||||
const subStats = await getDirectoryStats(itemPath);
|
||||
document_count += subStats.document_count;
|
||||
unique_document_count += subStats.unique_document_count;
|
||||
duplicate_count += subStats.duplicate_count;
|
||||
total_size_bytes += subStats.total_size_bytes;
|
||||
|
||||
// Merge file type stats
|
||||
for (const [ext, count] of Object.entries(subStats.file_type_stats)) {
|
||||
file_type_stats[ext] = (file_type_stats[ext] || 0) + count;
|
||||
}
|
||||
|
||||
// Merge content hashes to detect duplicates across subdirectories
|
||||
for (const hash of subStats.contentHashes) {
|
||||
contentHashes.add(hash);
|
||||
}
|
||||
} else {
|
||||
// Count files as documents
|
||||
document_count++;
|
||||
@@ -373,14 +423,53 @@ async function getDirectoryStats(
|
||||
|
||||
// Track file extension
|
||||
const ext = path.extname(item).toLowerCase() || "no-extension";
|
||||
|
||||
// Calculate content hash for image files to detect duplicates
|
||||
const isImageFile = [
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".png",
|
||||
".gif",
|
||||
".bmp",
|
||||
".webp",
|
||||
".heic",
|
||||
".heif",
|
||||
".tiff",
|
||||
".tif"
|
||||
].includes(ext);
|
||||
|
||||
if (isImageFile) {
|
||||
try {
|
||||
const fileHash = await calculateFileHash(itemPath);
|
||||
|
||||
if (contentHashes.has(fileHash)) {
|
||||
// This is a duplicate - don't count in file_type_stats
|
||||
duplicate_count++;
|
||||
} else {
|
||||
// This is unique
|
||||
contentHashes.add(fileHash);
|
||||
unique_document_count++;
|
||||
// Only count unique files in file_type_stats
|
||||
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
||||
}
|
||||
} catch (hashError) {
|
||||
logger.warning(`Failed to hash file ${itemPath}:`, hashError);
|
||||
// If hashing fails, count as unique to avoid losing data
|
||||
unique_document_count++;
|
||||
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
||||
}
|
||||
} else {
|
||||
// Non-image files are counted as unique (not checking for duplicates)
|
||||
unique_document_count++;
|
||||
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Error analyzing directory ${dirPath}:`, error);
|
||||
}
|
||||
|
||||
return { document_count, total_size_bytes, file_type_stats };
|
||||
return { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats, contentHashes };
|
||||
}
|
||||
|
||||
let bodyshopid: UUID;
|
||||
|
||||
Reference in New Issue
Block a user