update unique counts.
This commit is contained in:
107
util/s3Sync.ts
107
util/s3Sync.ts
@@ -8,6 +8,8 @@ import { FolderPaths } from "./serverInit.js";
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { UUID } from "crypto";
|
import { UUID } from "crypto";
|
||||||
import fsPromises from "fs/promises";
|
import fsPromises from "fs/promises";
|
||||||
|
import crypto from "crypto";
|
||||||
|
import { createReadStream } from "fs";
|
||||||
|
|
||||||
const execAsync = promisify(exec);
|
const execAsync = promisify(exec);
|
||||||
|
|
||||||
@@ -24,6 +26,8 @@ export interface JobFolderStats {
|
|||||||
jobid: UUID | string | null;
|
jobid: UUID | string | null;
|
||||||
//relativePath: string;
|
//relativePath: string;
|
||||||
document_count: number;
|
document_count: number;
|
||||||
|
unique_document_count: number;
|
||||||
|
duplicate_count: number;
|
||||||
total_size_bytes: number;
|
total_size_bytes: number;
|
||||||
total_size_mb: number;
|
total_size_mb: number;
|
||||||
file_type_stats: { [extension: string]: number };
|
file_type_stats: { [extension: string]: number };
|
||||||
@@ -33,6 +37,8 @@ export interface JobsDirectoryAnalysis {
|
|||||||
bodyshopid: UUID;
|
bodyshopid: UUID;
|
||||||
total_jobs: number;
|
total_jobs: number;
|
||||||
total_documents: number;
|
total_documents: number;
|
||||||
|
unique_documents: number;
|
||||||
|
duplicate_documents: number;
|
||||||
total_size_bytes: number;
|
total_size_bytes: number;
|
||||||
total_size_mb: number;
|
total_size_mb: number;
|
||||||
file_type_stats: { [extension: string]: number };
|
file_type_stats: { [extension: string]: number };
|
||||||
@@ -203,6 +209,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
|||||||
bodyshopid,
|
bodyshopid,
|
||||||
total_jobs: 0,
|
total_jobs: 0,
|
||||||
total_documents: 0,
|
total_documents: 0,
|
||||||
|
unique_documents: 0,
|
||||||
|
duplicate_documents: 0,
|
||||||
total_size_bytes: 0,
|
total_size_bytes: 0,
|
||||||
total_size_mb: 0,
|
total_size_mb: 0,
|
||||||
file_type_stats: {},
|
file_type_stats: {},
|
||||||
@@ -216,6 +224,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
|||||||
|
|
||||||
const jobFolders = await readdir(jobsPath);
|
const jobFolders = await readdir(jobsPath);
|
||||||
let total_documents = 0;
|
let total_documents = 0;
|
||||||
|
let total_unique_documents = 0;
|
||||||
|
let total_duplicate_documents = 0;
|
||||||
let total_size_bytes = 0;
|
let total_size_bytes = 0;
|
||||||
let total_jobs = 0;
|
let total_jobs = 0;
|
||||||
const aggregated_file_type_stats: { [extension: string]: number } = {};
|
const aggregated_file_type_stats: { [extension: string]: number } = {};
|
||||||
@@ -237,6 +247,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
|||||||
|
|
||||||
total_jobs++;
|
total_jobs++;
|
||||||
total_documents += folderStats.document_count;
|
total_documents += folderStats.document_count;
|
||||||
|
total_unique_documents += folderStats.unique_document_count;
|
||||||
|
total_duplicate_documents += folderStats.duplicate_count;
|
||||||
total_size_bytes += folderStats.total_size_bytes;
|
total_size_bytes += folderStats.total_size_bytes;
|
||||||
|
|
||||||
// Aggregate file type stats
|
// Aggregate file type stats
|
||||||
@@ -268,6 +280,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
|||||||
bodyshopid, //read from the config.json file in the root directory
|
bodyshopid, //read from the config.json file in the root directory
|
||||||
total_jobs,
|
total_jobs,
|
||||||
total_documents,
|
total_documents,
|
||||||
|
unique_documents: total_unique_documents,
|
||||||
|
duplicate_documents: total_duplicate_documents,
|
||||||
total_size_bytes,
|
total_size_bytes,
|
||||||
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
|
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
|
||||||
file_type_stats: aggregated_file_type_stats,
|
file_type_stats: aggregated_file_type_stats,
|
||||||
@@ -275,9 +289,9 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents, ${analysis.total_size_mb} MB`
|
`Jobs directory analysis complete: ${analysis.total_jobs} jobs, ${analysis.total_documents} documents (${analysis.unique_documents} unique, ${analysis.duplicate_documents} duplicates), ${analysis.total_size_mb} MB`
|
||||||
);
|
);
|
||||||
|
|
||||||
//Add an upload to the IO database to categorize all of this.
|
//Add an upload to the IO database to categorize all of this.
|
||||||
const apiURL = process.env.IS_TEST
|
const apiURL = process.env.IS_TEST
|
||||||
? "https://api.test.imex.online/analytics/documents"
|
? "https://api.test.imex.online/analytics/documents"
|
||||||
@@ -311,7 +325,8 @@ export async function analyzeJobsDirectory(): Promise<JobsDirectoryAnalysis> {
|
|||||||
async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> {
|
async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUID): Promise<JobFolderStats> {
|
||||||
const jobFolderPath = path.join(jobsPath, jobid);
|
const jobFolderPath = path.join(jobsPath, jobid);
|
||||||
|
|
||||||
const { document_count, total_size_bytes, file_type_stats } = await getDirectoryStats(jobFolderPath);
|
const { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats } =
|
||||||
|
await getDirectoryStats(jobFolderPath);
|
||||||
|
|
||||||
const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/;
|
const uuidRegex = /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$/;
|
||||||
let validJobid: UUID | string | null = null;
|
let validJobid: UUID | string | null = null;
|
||||||
@@ -327,6 +342,8 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
|
|||||||
bodyshopid,
|
bodyshopid,
|
||||||
jobid: validJobid,
|
jobid: validJobid,
|
||||||
document_count,
|
document_count,
|
||||||
|
unique_document_count,
|
||||||
|
duplicate_count,
|
||||||
total_size_bytes,
|
total_size_bytes,
|
||||||
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
|
total_size_mb: Math.round((total_size_bytes / (1024 * 1024)) * 100) / 100,
|
||||||
file_type_stats
|
file_type_stats
|
||||||
@@ -334,14 +351,40 @@ async function analyzeJobFolder(jobsPath: string, jobid: string, bodyshopid: UUI
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Recursively get document count and total size for a directory (helper function)
|
* Calculate SHA256 hash of a file's content
|
||||||
*/
|
*/
|
||||||
async function getDirectoryStats(
|
async function calculateFileHash(filePath: string): Promise<string> {
|
||||||
dirPath: string
|
return new Promise((resolve, reject) => {
|
||||||
): Promise<{ document_count: number; total_size_bytes: number; file_type_stats: { [extension: string]: number } }> {
|
const hash = crypto.createHash("sha256");
|
||||||
|
const stream = createReadStream(filePath);
|
||||||
|
|
||||||
|
stream.on("data", (data: string | Buffer) => hash.update(data));
|
||||||
|
stream.on("end", () => resolve(hash.digest("hex")));
|
||||||
|
stream.on("error", (error: Error) => {
|
||||||
|
console.error("Hashing error:", error);
|
||||||
|
reject(error);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recursively get document count and total size for a directory (helper function)
|
||||||
|
 * Image files are de-duplicated by SHA-256 content hash; all other files are counted as unique without being hashed
|
||||||
|
*/
|
||||||
|
async function getDirectoryStats(dirPath: string): Promise<{
|
||||||
|
document_count: number;
|
||||||
|
unique_document_count: number;
|
||||||
|
duplicate_count: number;
|
||||||
|
total_size_bytes: number;
|
||||||
|
file_type_stats: { [extension: string]: number };
|
||||||
|
contentHashes: Set<string>;
|
||||||
|
}> {
|
||||||
let document_count = 0;
|
let document_count = 0;
|
||||||
|
let unique_document_count = 0;
|
||||||
|
let duplicate_count = 0;
|
||||||
let total_size_bytes = 0;
|
let total_size_bytes = 0;
|
||||||
const file_type_stats: { [extension: string]: number } = {};
|
const file_type_stats: { [extension: string]: number } = {};
|
||||||
|
const contentHashes = new Set<string>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const items = await readdir(dirPath);
|
const items = await readdir(dirPath);
|
||||||
@@ -360,12 +403,19 @@ async function getDirectoryStats(
|
|||||||
// Recursively analyze subdirectories
|
// Recursively analyze subdirectories
|
||||||
const subStats = await getDirectoryStats(itemPath);
|
const subStats = await getDirectoryStats(itemPath);
|
||||||
document_count += subStats.document_count;
|
document_count += subStats.document_count;
|
||||||
|
unique_document_count += subStats.unique_document_count;
|
||||||
|
duplicate_count += subStats.duplicate_count;
|
||||||
total_size_bytes += subStats.total_size_bytes;
|
total_size_bytes += subStats.total_size_bytes;
|
||||||
|
|
||||||
// Merge file type stats
|
// Merge file type stats
|
||||||
for (const [ext, count] of Object.entries(subStats.file_type_stats)) {
|
for (const [ext, count] of Object.entries(subStats.file_type_stats)) {
|
||||||
file_type_stats[ext] = (file_type_stats[ext] || 0) + count;
|
file_type_stats[ext] = (file_type_stats[ext] || 0) + count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
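// Merge the subdirectory's hashes so files hashed later in this directory are compared against them (each subdirectory is scanned with its own fresh set, so sibling subdirectories are not compared with each other)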
|
||||||
|
for (const hash of subStats.contentHashes) {
|
||||||
|
contentHashes.add(hash);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Count files as documents
|
// Count files as documents
|
||||||
document_count++;
|
document_count++;
|
||||||
@@ -373,14 +423,53 @@ async function getDirectoryStats(
|
|||||||
|
|
||||||
// Track file extension
|
// Track file extension
|
||||||
const ext = path.extname(item).toLowerCase() || "no-extension";
|
const ext = path.extname(item).toLowerCase() || "no-extension";
|
||||||
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
|
||||||
|
// Calculate content hash for image files to detect duplicates
|
||||||
|
const isImageFile = [
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".png",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".webp",
|
||||||
|
".heic",
|
||||||
|
".heif",
|
||||||
|
".tiff",
|
||||||
|
".tif"
|
||||||
|
].includes(ext);
|
||||||
|
|
||||||
|
if (isImageFile) {
|
||||||
|
try {
|
||||||
|
const fileHash = await calculateFileHash(itemPath);
|
||||||
|
|
||||||
|
if (contentHashes.has(fileHash)) {
|
||||||
|
// This is a duplicate - don't count in file_type_stats
|
||||||
|
duplicate_count++;
|
||||||
|
} else {
|
||||||
|
// This is unique
|
||||||
|
contentHashes.add(fileHash);
|
||||||
|
unique_document_count++;
|
||||||
|
// Only count unique files in file_type_stats
|
||||||
|
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
||||||
|
}
|
||||||
|
} catch (hashError) {
|
||||||
|
logger.warning(`Failed to hash file ${itemPath}:`, hashError);
|
||||||
|
// If hashing fails, count as unique to avoid losing data
|
||||||
|
unique_document_count++;
|
||||||
|
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Non-image files are counted as unique (not checking for duplicates)
|
||||||
|
unique_document_count++;
|
||||||
|
file_type_stats[ext] = (file_type_stats[ext] || 0) + 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error analyzing directory ${dirPath}:`, error);
|
logger.error(`Error analyzing directory ${dirPath}:`, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
return { document_count, total_size_bytes, file_type_stats };
|
return { document_count, unique_document_count, duplicate_count, total_size_bytes, file_type_stats, contentHashes };
|
||||||
}
|
}
|
||||||
|
|
||||||
let bodyshopid: UUID;
|
let bodyshopid: UUID;
|
||||||
|
|||||||
Reference in New Issue
Block a user