handled UTF-8 BOM in CSV input

This commit is contained in:
SPRINX0\prochazka
2025-12-04 16:44:08 +01:00
parent 23cf264d4d
commit 89121a2608

View File

@@ -6,6 +6,56 @@ const lineReader = require('line-reader');
let dbgateApi;
// UTF-8 encoding of the byte order mark (U+FEFF).
const UTF8_BOM = Buffer.from([0xef, 0xbb, 0xbf]);

/**
 * Transform stream that drops a leading UTF-8 byte order mark (EF BB BF)
 * from the data flowing through it and forwards every other byte unchanged.
 *
 * The BOM may arrive split across several chunks, so the first bytes are held
 * back only for as long as they are still a possible prefix of the BOM.
 */
class StripUtf8BomTransform extends stream.Transform {
  /**
   * @param {object} [options] - passed through unchanged to stream.Transform
   */
  constructor(options) {
    super(options);
    // Becomes true once the start of the stream has been inspected.
    this._checkedBOM = false;
    // Leading bytes held back while they could still turn out to be a BOM.
    this._pending = Buffer.alloc(0);
  }

  /**
   * @param {Buffer|string} chunk - next piece of input
   * @param {string} encoding - encoding of `chunk` when it is a string
   * @param {Function} callback - invoked once the chunk has been handled
   */
  _transform(chunk, encoding, callback) {
    // Chunks are strings when the stream was created with decodeStrings: false;
    // convert them so the byte-level comparison below works for both kinds.
    const bytes = typeof chunk === 'string' ? Buffer.from(chunk, encoding) : chunk;

    if (this._checkedBOM) {
      // BOM decision already made, just pass the data through.
      this.push(bytes);
      return callback();
    }

    this._pending = this._pending.length === 0 ? bytes : Buffer.concat([this._pending, bytes]);

    // Compare only as many bytes as are available (at most the BOM length).
    const probeLength = Math.min(this._pending.length, UTF8_BOM.length);
    const isBomPrefix = this._pending.subarray(0, probeLength).equals(UTF8_BOM.subarray(0, probeLength));

    if (isBomPrefix && probeLength < UTF8_BOM.length) {
      // Everything seen so far matches the start of a BOM; wait for more bytes.
      return callback();
    }

    // Either the full BOM was seen (strip it) or the data already differs from
    // it (forward everything held back so far).
    const output = isBomPrefix ? this._pending.subarray(UTF8_BOM.length) : this._pending;
    if (output.length > 0) {
      this.push(output);
    }
    this._pending = Buffer.alloc(0);
    this._checkedBOM = true;
    callback();
  }

  /**
   * Called when the input ends. Any bytes still held back are an incomplete
   * BOM prefix (fewer than 3 bytes), which is not a BOM, so they are emitted
   * as-is.
   *
   * @param {Function} callback - invoked once the remaining bytes were pushed
   */
  _flush(callback) {
    if (!this._checkedBOM && this._pending.length > 0) {
      this.push(this._pending);
    }
    this._pending = Buffer.alloc(0);
    callback();
  }
}
function readFirstLine(file) {
return new Promise((resolve, reject) => {
lineReader.open(file, (err, reader) => {
@@ -95,7 +145,7 @@ async function reader({ fileName, encoding = 'utf-8', header = true, delimiter,
});
const fileStream = fs.createReadStream(downloadedFile, encoding);
const csvPrepare = new CsvPrepareStream({ header });
return [fileStream, csvStream, csvPrepare];
return [fileStream, new StripUtf8BomTransform(), csvStream, csvPrepare];
// fileStream.pipe(csvStream);
// csvStream.pipe(csvPrepare);
// return csvPrepare;