/* This is part of oni-ocfl
(c) The University of Queensland 2021
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* Test for collection.js */
const path = require("node:path");
const fs = require("fs-extra");
const os = require("node:os");
const util = require("node:util");
const execFile = util.promisify(require('node:child_process').execFile);
const Provenance = require("./provenance.js");
const getLogger = require("./common/logger").getLogger;
const workingPath = require("./common").workingPath;
const { ROCrate, validate } = require("ro-crate");
const { createCommand } = require('commander');
const ocfl = require("@ocfl/ocfl-fs");
//const shell = require("shelljs")
const generateArcpId = require("./mint-arcp-id");
const tmp = require("tmp");
const _ = require("lodash");
const assert = require("assert");
//const { opendir } = require('fs').promises;
const ExcelJS = require('exceljs');
const modes = require("ro-crate-modes");
const { Preview, HtmlFile } = require("ro-crate-html");
const PRONOM_URI_BASE = 'https://www.nationalarchives.gov.uk/PRONOM/';
const mainPackage = (function () {
try {
return require(path.join(require.main.path, "package.json"));
} catch (e) { }
return {};
})();
function isRelFilePath(url) {
try {
new URL(url);
return false;
} catch (e) {
if (url.startsWith('#')) return false;
return true;
}
}
// OCFLObject
class CollectionObject {
constructor(parent, crateDir, metaFile) {
this.collector = parent;
const rocrateOpts = { alwaysAsArray: true, resolveLinks: true };
if (crateDir) {
if(metaFile) {
// Load the RO-Crate from memory
this.crate = metaFile;
} else {
// Load the RO-Crate from the specified directory
console.log("CRATE DIR", crateDir)
const metaPath = path.join(crateDir, "ro-crate-metadata.json");
const json = JSON.parse(fs.readFileSync(metaPath));
this.crate = new ROCrate(json, rocrateOpts);
}
} else {
this.crate = new ROCrate({}, rocrateOpts);
}
this.rootDataset = this.crate.root;
fs.ensureDirSync(parent.tempDirPath);
this.__tmpobj = tmp.dirSync({ tmpdir: parent.tempDirPath });
this.dir = this.__tmpobj.name;
this.files = {};
this.dataDir = crateDir;
}
mintArcpId(paths, id) {
if (Array.isArray(paths)) {
if (id) {
paths.push(id)
}
}
if (typeof paths === 'string') {
paths = [paths];
if (id) {
paths.push(id);
}
}
this.id = generateArcpId(this.collector.namespace, paths);
this.rootDataset["@id"] = this.id;
// const metadataDesc = this.crate.getItem(this.crate.defaults.roCrateMetadataID);
// metadataDesc.about = this.rootDataset;
}
/**
* Add a file or directory in the local file system to the queue to be imported to the OCFL repository.
* @param {string} source
* @param {string} target
* @param {object} entityProps If specified, a file entity will be created and added to the crate.
*/
importFile(source, target, entityProps) {
this.files[target] = {
exists: true,
source,
target
};
if (entityProps) {
const fileEntity = {
'@id': target,
'@type': ['File'],
name: target,
...entityProps
};
const types = [].concat(fileEntity['@type']).filter(t => t);
if (!types.includes('File')) {
types.unshift('File');
fileEntity['@type'] = types;
}
if (fileEntity.isPartOf) {
for (const p of [].concat(fileEntity.isPartOf)) {
this.crate.addValues(p, 'hasPart', fileEntity);
}
} else {
this.crate.addValues(this.crate.rootId, 'hasPart', fileEntity);
}
}
}
// Copy a file into an objects temp directory and add it to the crate
async addFile(f, srcDir, filePath, addToRootHasPart) {
// File should be JSCON
//f: IS a JSON-LD item
// subDir is an optional directory under the colllectorDataDir
// addToRootHasPart defaults to true,makes sure all files are linked stucturally as per RO-Crate spec
// Set to false if file is already linked
if (addToRootHasPart != false) {
this.crate.pushValue(this.rootDataset, "hasPart", f);
} else {
this.crate.addItem(f);
}
var srcPath;
if (filePath) {
srcPath = path.join(srcDir, filePath)
} else if (srcDir) {
srcPath = path.join(srcDir, f["@id"])
} else {
srcPath = path.join(this.collector.dataDir, f["@id"]);
}
const destPath = path.join(this.dir, f["@id"]);
if (fs.existsSync(srcPath)) {
await fs.ensureFile(destPath);
f.size = (await fs.stat(srcPath)).size;
//console.log(srcPath, destPath)
await fs.copyFile(srcPath, destPath);
console.log("Copied", srcPath, destPath);
} else {
console.error(`WARNING MISSING FILE: ${srcPath}`);
}
}
/**
* Write data into a file in the temp dir and add it to the crate
* @param {*} f
* @param {*} data
*/
async writeFile(f, data) {
// File should be JSCON
//f: IS a JSON-LD item
// subDir is an optional directory under the colllectorDataDir
this.crate.addValues(this.root, 'hasPart', f);
const destPath = path.join(this.dir, f["@id"]);
await fs.ensureFile(destPath);
await fs.writeFile(destPath, data);
}
async generateHTML() {
// Save an HTML file
return (new HtmlFile(new Preview(this.crate))).render();
}
/**
* Iterate through all the entities in the crate with type File and ensure that the file is imported.
*/
async _processFiles() {
const { dataDir, templateCrateDir, sfData } = this.collector;
const crate = this.crate;
for (let entity of crate.entities()) {
const entityId = entity['@id'];
if (entity['@type'].includes('File') && isRelFilePath(entityId)) {
let file = this.files[entityId];
if (!file) {
const source = path.join(this.dataDir ?? dataDir ?? templateCrateDir, entityId);
this.importFile(source, entityId);
file = this.files[entityId];
}
try {
const stats = await fs.stat(file.source);
file.exists = true;
entity.contentSize = '' + stats.size;
let sfFile = sfData.get(entityId);
if (sfFile) {
const mime = sfFile.matches[0]?.mime;
if (mime) {
let formatID = PRONOM_URI_BASE + sfFile.matches[0]?.id;
crate.addValues(entity, 'encodingFormat', [mime, { '@id': formatID }]);
}
}
crate.addValues(crate.rootId, 'hasPart', { '@id': entityId });
} catch (error) {
if (error.code === 'ENOENT') {
file.exists = false;
console.log(`Warning: File '${file.source}' does not exist.`);
} else {
console.error(error);
}
}
}
}
}
/**
* Create an OCFL object in the repository and add all the required files.
* By default, all the file defined in the File entities in the crate will be imported to the OCFL automatically.
* To avoid that set ignoreFilesInCrate to false, for example, during testing.
* Use the files parameter to manually re-import only some specific files.
* @param {boolean} [ignoreFilesInCrate] if true then existing File entities in the crate will not be automatically added to the OCFL repo,
* and ROCrate validator will not be validating against the files.
* @param {Array.<Array.<string>>} [files] If specified, each file in the array of tuple [source,destination][] will be imported to the OCFL object,
* in addition the to the already existing files added using importFile() method. This is the same as calling importFile() on each of the tuple prior.
*/
async addToRepo(ignoreFilesInCrate = false, files) {
await this.crate.resolveContext();
//TODO: check method and get rid of identifier
//this.crate.addIdentifier({ name: this.collector.repoName, identifier: this.id });
// const localId = `_:local-id:${this.collector.repoName}:${this.id}`;
// const localRepoId = this.crate.hasEntity(localId);
// assert(localRepoId, 'Was not able to add identifier');
this.crate.addEntity(this.collector.prov.scriptTool);
this.crate.addEntity(this.collector.prov.createAction);
if (!ignoreFilesInCrate) {
await this._processFiles();
}
const results = await validate(this.crate, ignoreFilesInCrate ? undefined : this.files);
for (let r of results) {
if (r.status === 'error') {
let message = `Problem while adding to repository for ${this.id} error: ${r.id} : ${r.message}`;
if (r.entity) {
message += `: entity: ${r.entity}`;
}
throw new Error(message);
}
}
try {
// validate crate
let { excelValidator, modeValidator } = this.collector;
if (excelValidator) {
if (typeof excelValidator !== 'string') {
excelValidator = 'ro-crate-validation.xlsx';
}
await validateWithExcel(excelValidator, this.crate);
}
if (modeValidator) {
if (typeof modeValidator !== 'string') {
modeValidator = 'https://language-research-technology.github.io/ro-crate-modes/modes/comprehensive-ldac.json';
}
await validateWithMode(modeValidator, this.crate);
}
// const rocrateFile = path.join(this.dir, "ro-crate-metadata.json");
// const previewFile = path.join(this.dir, "ro-crate-preview.html");
//await fs.writeFile(rocrateFile, JSON.stringify(this.crate, null, 2));
const previewContent = await this.generateHTML();
//await fs.writeFile(previewFile, previewContent);
let object = this.collector.repo.object(this.id);
// let imports = [[this.dir, ""]]
// if (files && fileList.length > 0) {
// imports.push(...fileList);
// }
// await object.import(imports);
await object.update(async (t) => {
await t.write('ro-crate-metadata.json', JSON.stringify(this.crate, null, 2));
await t.write('ro-crate-preview.html', previewContent);
if (this.dir) {
await t.import(this.dir, "");
}
for (const target in this.files) {
const { exists, source } = this.files[target];
if (exists) {
await t.import(source, target);
}
}
if (Array.isArray(files)) {
for (const [source, target] of files) {
await t.import(source, target);
}
}
})
console.log(`Wrote crate ${object}`);
} catch (error) {
console.error(error);
}
console.log(`Deleting crateDir: ${this.dir}`);
fs.rmSync(this.dir, { recursive: true, force: true });
console.log(`Deleted crateDir: ${this.dir}`)
}
}
const defaultTempDir = fs.realpathSync(os.tmpdir());
function getOpts(opts) {
// extraOpts TODO: Array of arrays with extra .options (see below)
const program = createCommand();
program.option('-r, --repo-path <type>', 'Path to OCFL repository')
.option('-n, --repo-name <type>', 'Name of OCFL repository')
.option('-z, --repo-scratch <ns>', 'Path of the scratch ocfl repo')
.option('-s, --namespace <ns>', 'namespace for ARCP IDs')
.option('-c, --collection-name <ns>', 'Name of this collection (if not in template)')
.option('-x, --excel <file>', 'Excel file')
.option('--vx, --validate-with-excel [file]', 'Excel file for validation')
.option('--vm, --validate-with-mode [file]', 'A path or url to the mode file')
.option('-p, --temp-path <dirs>', 'Temporary Directory Path')
.option('-t, --template <dirs>', 'RO-Crate directory on which to base this the RO-Crate metadata file will be used as a base and any files copied in to the new collection crate')
.option('-d, --data-dir <dirs>', "Directory of data files with sub directories '/Sound files' (for .wav) and '/Transcripts' (.csv)")
.option('-D, --debug <ns>', 'Use this in your collector to turn off some behaviour for debugging')
.option('-m, --multiple', 'Output multiple Objects rather than a single object')
.option('--sf [file]', 'Run siegfried on the files in the crate and write the output to a file in the data directory')
program.allowExcessArguments(true);
program.parse(process.argv);
// merge the opts
return { ...program.opts(), ...opts };
}
// Collector is a class for use in building (or adding to) an OCFL repo for a collection of data (eg a linguistic Collector)
class Collector {
static mainPackage = mainPackage;
constructor(opts = {}) {
this.opts = getOpts(opts);
this.excelPath = this.opts.excel;
this.tempDirPath = this.opts.tempPath || defaultTempDir;
this.repoPath = this.opts.repoPath || "../repo";
this.repoScratch = this.opts.repoScratch || "../scratch";
this.repoName = this.opts.repoName || "repository";
this.debug = this.opts.debug;
if (this.debug == "true") { // Force type coercion
this.debug = true;
console.log('\n *** RUNNING IN DEBUG MODE *** \n');
} else {
this.debug = false;
}
this.templateCrateDir = this.opts.template || this.opts.templateCrateDir || this.opts.dataDir || './';
this.dataDir = this.opts.dataDir || this.opts.template || './';
this.excelFile = this.opts.excel;
this.excelValidator = this.opts.validateWithExcel;
this.namespace = this.opts.namespace; // eg "sydney-speaks" or "monash-Collector-of-english"
this.CollectorName = this.opts.CollectorName;
let pkg = Collector.mainPackage;
pkg.inputs = pkg.inputs || this.opts.inputs || this.opts.excel ? { '@id': path.basename(this.opts.excel) } : undefined;
// This is slow so do it now
this.prov = new Provenance(pkg);
this.modeValidator = this.opts.validateWithMode;
this.sf = this.opts.sf != null;
this.sfFileName = (this.sf && typeof this.sf === 'string') ? this.sf : '.siegfried.json';
this.sfData = new Map();
}
async connect() {
this.repo = ocfl.storage({
root: this.repoPath, layout: {
extensionName: '000N-path-direct-storage-layout'
}
});
if (!await fs.pathExists(this.repoPath)) {
console.log("CREATING")
await this.repo.create();
await this.repo.load();
} else {
await this.repo.load();
}
}
static async create(opts = {}) {
const collector = new Collector(opts);
await collector.connect(); // Make or find the OCFL repo
if (collector.sf) {
await collector.runSiegfried();
}
return collector;
}
async runSiegfried(force) {
// read the cache
const sfCachePath = path.join(this.dataDir, this.sfFileName);
let sfJson;
try {
sfJson = await fs.readJson(sfCachePath);
} catch (error) {
if (error.code === 'ENOENT') {
console.log("No cache file found for Siegfried data");
} else {
console.log(error);
}
}
if (!sfJson || force) {
try {
console.log("Running Siegfried on", this.dataDir);
const { stdout } = await execFile('sf', ['-json', this.dataDir]);
sfJson = JSON.parse(stdout);
fs.writeFile(sfCachePath, stdout);
} catch (error) {
console.log(error);
}
}
const sfData = this.sfData;
sfData.clear();
for (const file of sfJson?.files || []) {
sfData.set(path.relative(this.dataDir, file.filename), file);
}
//console.log(sfData);
}
/**
* Create a new collection object for ingestion into the OCFL repository.
* If `crate` is provided, that in-memory ROCrate instance will be used.
* Otherwise, metadata will be loaded from `cratePath/ro-crate-metadata.json`.
* @param {string} cratePath Directory containing `ro-crate-metadata.json`.
* @param {ROCrate} [crate] Optional in-memory ROCrate instance.
* @returns {CollectionObject} A mutable collection object ready to be updated and persisted with `addToRepo()`.
*/
newObject(cratePath, crate) {
return new CollectionObject(this, cratePath, crate);
}
}
const validateWorksheet = {
/**
* Handle properties worksheet
* @param {ExcelJS.Worksheet} ws
* @param {ROCrate} roc
* @param {Array} errors
*/
types(ws, roc, errors) {
const expected = {};
let i, len = ws.rowCount;
for (i = 2; i <= len; ++i) {
const row = ws.getRow(i);
if (row.hasValues) {
const crate = row.getCell('crate').text || 'all';
const type = row.getCell('type').text;
if (type) {
expected[crate] = expected[crate] || {};
expected[crate][type] = row.getCell('count').value;
}
}
}
const realCount = {};
for (const e of roc.entities()) {
for (const t of e['@type']) {
realCount[t] = realCount[t] || 0;
realCount[t]++;
}
}
//console.log(realCount);
// match the count
for (const ec of [expected.all, expected[roc.rootId]].filter(c => c)) {
for (const t in ec) {
if (ec[t] !== realCount[t]) {
errors.push(`[validation] Entities ${t} expected count is ${ec[t]} but actual is ${realCount[t]}`);
}
}
}
},
/**
* Handle properties worksheet
* @param {ExcelJS.Worksheet} ws
* @param {ROCrate} roc
* @param {Array} errors
*/
properties(ws, roc, errors) {
const expected = {};
let i, len = ws.rowCount;
for (i = 2; i <= len; ++i) {
const row = ws.getRow(i);
if (row.hasValues) {
const entity = row.getCell('entity').text;
const property = row.getCell('property').text;
if (entity && property) {
expected[entity] = expected[entity] || {};
expected[entity][property] = [row.getCell('count').value, row.getCell('value').text];
}
}
}
for (const entityId in expected) {
const entity = entityId === './' ? roc.rootDataset : roc.getEntity(entityId);
for (const prop in expected[entityId]) {
const [expectedCount, expectedValue] = expected[entityId][prop];
if (expectedCount != null) {
if (entity[prop].length !== expectedCount) {
errors.push(`[validation][${entityId}.${prop}] Expected value count of ${expectedCount} but got ${entity[prop].length}`);
}
} else if (expectedValue != null) {
if (entity[prop][0] !== expectedValue) {
errors.push(`[validation][${entityId}.${prop}] Expected value '${expectedValue}' but got '${entity[prop][0]}'`);
}
}
}
}
}
};
/**
*
* @param {string} excelFile
* @param {ROCrate} roc
*/
async function validateWithExcel(excelFile, roc) {
console.log('Validating crate using numbers in', excelFile);
// Validate the RO-Crate using ExcelJS
const workbook = new ExcelJS.Workbook();
await workbook.xlsx.readFile(excelFile);
let errors = [];
workbook.eachSheet((ws) => {
const hRow = ws.getRow(1);
// assign column name based on 1st row values to allow easy access
hRow.eachCell((cell, colNumber) => {
ws.getColumn(colNumber).key = cell.text.toLowerCase();
});
try {
validateWorksheet[ws.name](ws, roc, errors);
} catch (e) {
console.error(e);
errors.push(`[validation] Please check ${excelFile} file. Format error in worksheet ${ws.name}`);
}
});
if (errors.length) {
console.log(errors);
throw new Error("Metadata stat does not pass validation");
}
}
/**
*
* @param {string} modeFile A path or URL to mode file
* @param {ROCrate} roc
*/
async function validateWithMode(modeFile, roc) {
console.log('Validating crate using mode', modeFile);
let errors = [];
let mode;
try {
let res = await fetch(modeFile);
if (res.ok) {
mode = await res.json();
}
} catch (error) {
mode = fs.readJsonSync(modeFile);
}
if (!mode) return;
let results = modes.validate(mode, roc);
for (let entityId in results) {
for (let propId in results[entityId].props) {
//errors.push(`[validation][mode][${entityId}] is missing required input ${input.name} of type ${input.type}`);
let prop = results[entityId].props[propId];
errors.push(...prop.errors.map(e => `[validation][mode][${entityId}][${prop.name}] ${e.description}`));
}
}
if (errors.length) {
console.log(errors);
throw new Error("ro-crate-metadata does not pass mode validation");
}
}
module.exports = Collector;