// Files
// dbgate/packages/datalib/src/chartProcessor.ts
// 2025-06-19 12:08:06 +00:00
//
// 396 lines
// 12 KiB
// TypeScript

import {
ChartAvailableColumn,
ChartDateParsed,
ChartDefinition,
ChartLimits,
ProcessedChart,
} from './chartDefinitions';
import _sortBy from 'lodash/sortBy';
import _sum from 'lodash/sum';
import {
aggregateChartNumericValuesFromSource,
autoAggregateCompactTimelineChart,
computeChartBucketCardinality,
computeChartBucketKey,
fillChartTimelineBuckets,
tryParseChartDate,
} from './chartTools';
import { getChartScore, getChartYFieldScore } from './chartScoring';
/**
 * Incrementally builds chart data from rows.
 *
 * Rows are fed through `addRow` / `addRows`; `finalize` then orders the buckets,
 * applies the autodetect limits and publishes the result in `charts`.
 *
 * When no chart definitions are passed to the constructor, charts are
 * auto-detected: one line chart (bucketed with `date:day`) is created per column
 * that `tryParseChartDate` recognizes, and the remaining columns of the row are
 * registered as measure candidates.
 */
export class ChartProcessor {
  /** Charts that are still accumulating rows. */
  chartsProcessing: ProcessedChart[] = [];
  /** Finished charts; filled by `finalize`. */
  charts: ProcessedChart[] = [];
  /** Every column name seen in any row, keyed by field name. */
  availableColumnsDict: { [field: string]: ChartAvailableColumn } = {};
  /** Values of `availableColumnsDict`; filled by `finalize`. */
  availableColumns: ChartAvailableColumn[] = [];
  /** True when no definitions were given, so charts are derived from the data. */
  autoDetectCharts = false;
  /** Number of rows passed to `addRow` so far. */
  rowsAdded = 0;
  errorMessage?: string;

  /**
   * @param givenDefinitions Explicit chart definitions. When empty, charts are
   *   auto-detected from the incoming rows instead.
   */
  constructor(public givenDefinitions: ChartDefinition[] = []) {
    for (const definition of givenDefinitions) {
      this.chartsProcessing.push(ChartProcessor.createProcessedChart(definition, true));
    }
    this.autoDetectCharts = this.givenDefinitions.length == 0;
  }

  /** Creates the empty accumulator state for one chart definition. */
  private static createProcessedChart(definition: ChartDefinition, isGivenDefinition: boolean): ProcessedChart {
    return {
      definition,
      rowsAdded: 0,
      bucketKeysOrdered: [],
      buckets: {},
      bucketKeyDateParsed: {},
      isGivenDefinition,
      invalidXRows: 0,
      invalidYRows: {},
      availableColumns: [],
      validYRows: {},
      topDistinctValues: {},
    };
  }

  // findOrCreateChart(definition: ChartDefinition, isGivenDefinition: boolean): ProcessedChart {
  // const signatureItems = [
  // definition.chartType,
  // definition.xdef.field,
  // definition.xdef.transformFunction,
  // definition.ydefs.map(y => y.field).join(','),
  // ];
  // const signature = signatureItems.join('::');
  // if (this.chartsBySignature[signature]) {
  // return this.chartsBySignature[signature];
  // }
  // const chart: ProcessedChart = {
  // definition,
  // rowsAdded: 0,
  // bucketKeysOrdered: [],
  // buckets: {},
  // bucketKeyDateParsed: {},
  // isGivenDefinition,
  // };
  // this.chartsBySignature[signature] = chart;
  // return chart;
  // }

  /**
   * Feeds one row into every chart that is being processed.
   *
   * Each column value is classified in this order: date (via
   * `tryParseChartDate`), finite number (numbers, or strings convertible with
   * `Number`), short string (fewer than 100 characters). In autodetect mode the
   * row may also create new charts and register new measures.
   *
   * @param row Plain object mapping column names to values.
   */
  addRow(row: any) {
    const dateColumns: { [key: string]: ChartDateParsed } = {};
    const numericColumns: { [key: string]: number } = {};
    const numericColumnsForAutodetect: { [key: string]: number } = {};
    const stringColumns: { [key: string]: string } = {};

    for (const [key, value] of Object.entries(row)) {
      // NOTE(review): Number('') is 0, so empty strings are classified as the number 0 here.
      const number: number = typeof value == 'string' ? Number(value) : typeof value == 'number' ? value : NaN;

      this.availableColumnsDict[key] = {
        field: key,
      };

      const keyLower = key.toLowerCase();
      const keyIsId = keyLower.endsWith('_id') || keyLower == 'id' || key.endsWith('Id');

      const parsedDate = tryParseChartDate(value);
      if (parsedDate) {
        dateColumns[key] = parsedDate;
        continue;
      }

      if (Number.isFinite(number)) {
        numericColumns[key] = number;
        if (!keyIsId) {
          // identifier-like columns are not offered as measures of auto-detected charts
          numericColumnsForAutodetect[key] = number;
        }
        continue;
      }

      if (typeof value === 'string' && Number.isNaN(number) && value.length < 100) {
        stringColumns[key] = value;
      }
    }

    if (this.autoDetectCharts) {
      // create charts from data, if there are no given definitions
      for (const datecol in dateColumns) {
        let usedChart = this.chartsProcessing.find(
          chart =>
            !chart.isGivenDefinition &&
            chart.definition.xdef.field === datecol &&
            chart.definition.xdef.transformFunction?.startsWith('date:')
        );

        if (!usedChart) {
          const canCreateChart =
            this.rowsAdded < ChartLimits.APPLY_LIMIT_AFTER_ROWS ||
            this.chartsProcessing.length < ChartLimits.AUTODETECT_CHART_LIMIT;
          if (!canCreateChart) {
            // No chart exists for this date column and the limits forbid creating one.
            // Without this guard the measure loop below would dereference undefined.
            continue;
          }
          usedChart = ChartProcessor.createProcessedChart(
            {
              chartType: 'line',
              xdef: {
                field: datecol,
                transformFunction: 'date:day',
              },
              ydefs: [],
            },
            false
          );
          this.chartsProcessing.push(usedChart);
        }

        const measures = usedChart.definition.ydefs;
        for (const [key, value] of Object.entries(row)) {
          if (value == null) continue;
          if (key == datecol) continue; // skip date column itself
          if (measures.some(y => y.field === key)) continue; // measure already registered

          if (
            this.rowsAdded < ChartLimits.APPLY_LIMIT_AFTER_ROWS ||
            measures.length < ChartLimits.AUTODETECT_MEASURES_LIMIT
          ) {
            measures.push({
              field: key,
              aggregateFunction: 'sum',
            });
          }
        }
      }
    }

    // feed the row into every chart that has not failed
    for (const chart of this.chartsProcessing) {
      if (chart.errorMessage) {
        continue; // skip charts with errors
      }
      if (chart.definition.xdef == null) {
        continue; // applyRawData ignores such charts anyway; avoid dereferencing xdef here
      }
      this.applyRawData(
        chart,
        row,
        dateColumns[chart.definition.xdef.field],
        chart.isGivenDefinition ? numericColumns : numericColumnsForAutodetect,
        stringColumns
      );

      if (Object.keys(chart.buckets).length > ChartLimits.CHART_FILL_LIMIT) {
        chart.errorMessage = `Chart has too many buckets, limit is ${ChartLimits.CHART_FILL_LIMIT}.`;
      }
    }

    for (let i = 0; i < this.chartsProcessing.length; i++) {
      if (this.chartsProcessing[i].errorMessage) {
        continue; // skip charts with errors
      }
      this.chartsProcessing[i] = autoAggregateCompactTimelineChart(this.chartsProcessing[i]);
    }

    this.rowsAdded += 1;
    if (this.rowsAdded == ChartLimits.APPLY_LIMIT_AFTER_ROWS) {
      this.applyLimitsOnCharts();
    }
  }

  /**
   * Enforces the autodetect limits on charts that were not explicitly given.
   *
   * - When there are more auto-detected charts than
   *   `ChartLimits.AUTODETECT_CHART_LIMIT`, only the best-scoring ones (by
   *   `getChartScore`) are kept, and the measures of every auto-detected chart
   *   are ordered by `getChartYFieldScore` and trimmed.
   * - Independently of the chart count, a chart holding more measures than
   *   `ChartLimits.AUTODETECT_MEASURES_LIMIT` is trimmed to its best-scoring ones.
   *
   * Charts created from given definitions are never modified or removed.
   */
  applyLimitsOnCharts() {
    const autodetectedCharts = this.chartsProcessing.filter(chart => !chart.isGivenDefinition);
    const chartLimitExceeded = autodetectedCharts.length > ChartLimits.AUTODETECT_CHART_LIMIT;

    for (const chart of autodetectedCharts) {
      if (chartLimitExceeded || chart.definition.ydefs.length > ChartLimits.AUTODETECT_MEASURES_LIMIT) {
        chart.definition.ydefs = _sortBy(chart.definition.ydefs, yfield => -getChartYFieldScore(chart, yfield)).slice(
          0,
          ChartLimits.AUTODETECT_MEASURES_LIMIT
        );
      }
    }

    if (chartLimitExceeded) {
      // Sort first, then cut: slicing before sorting would keep the first charts, not the best ones.
      const keptAutodetectedCharts = _sortBy(autodetectedCharts, chart => -getChartScore(chart)).slice(
        0,
        ChartLimits.AUTODETECT_CHART_LIMIT
      );
      this.chartsProcessing = [
        ...this.chartsProcessing.filter(chart => chart.isGivenDefinition),
        ...keptAutodetectedCharts,
      ];
    }
  }

  /** Convenience wrapper calling `addRow` for each argument, in order. */
  addRows(...rows: any[]) {
    for (const row of rows) {
      this.addRow(row);
    }
  }

  /**
   * Finishes processing and fills `charts` and `availableColumns`.
   *
   * Charts carrying an `errorMessage` are published unchanged, and charts that
   * received no rows are dropped. For the rest, bucket keys are ordered by the
   * definition's `sortOrder` (default `'ascKeys'`). Auto-detected charts lose
   * measures that have invalid rows or too few valid rows, and pie charts get
   * their small slices grouped. Given charts come first in the result, followed
   * by auto-detected charts in descending `getChartScore` order.
   */
  finalize() {
    this.applyLimitsOnCharts();
    this.availableColumns = Object.values(this.availableColumnsDict);

    for (const chart of this.chartsProcessing) {
      if (chart.errorMessage) {
        this.charts.push(chart);
        continue;
      }
      if (chart.rowsAdded == 0) {
        continue; // skip empty charts
      }

      let addedChart: ProcessedChart = chart;
      const sortOrder = chart.definition.xdef.sortOrder ?? 'ascKeys';

      // For 'natural' order the keys were already collected in arrival order by applyRawData.
      if (sortOrder == 'ascKeys' || sortOrder == 'descKeys') {
        // transformFunction may be missing, hence the optional call
        if (chart.definition.chartType == 'line' && chart.definition.xdef.transformFunction?.startsWith('date:')) {
          addedChart = autoAggregateCompactTimelineChart(addedChart);
          fillChartTimelineBuckets(addedChart);
        }
        if (addedChart.errorMessage) {
          this.charts.push(addedChart);
          continue;
        }
        addedChart.bucketKeysOrdered = _sortBy(Object.keys(addedChart.buckets));
        if (sortOrder == 'descKeys') {
          addedChart.bucketKeysOrdered.reverse();
        }
      } else if (sortOrder == 'ascValues' || sortOrder == 'descValues') {
        const buckets = addedChart.buckets;
        addedChart.bucketKeysOrdered = _sortBy(Object.keys(buckets), key =>
          computeChartBucketCardinality(buckets[key])
        );
        if (sortOrder == 'descValues') {
          addedChart.bucketKeysOrdered.reverse();
        }
      }

      if (!addedChart.isGivenDefinition) {
        // Keep only measures without invalid rows and with a sufficient share of valid rows.
        // A measure that never got a valid value yields NaN here and is dropped as well.
        const source = addedChart;
        addedChart = {
          ...source,
          definition: {
            ...source.definition,
            ydefs: source.definition.ydefs.filter(
              y =>
                !source.invalidYRows[y.field] &&
                source.validYRows[y.field] / source.rowsAdded >= ChartLimits.VALID_VALUE_RATIO_LIMIT
            ),
          },
        };
      }

      addedChart.availableColumns = this.availableColumns;
      this.charts.push(addedChart);
      // mutates the object that was just pushed
      this.groupPieOtherBuckets(addedChart);
    }

    this.charts = [
      ...this.charts.filter(x => x.isGivenDefinition),
      ..._sortBy(
        this.charts.filter(x => !x.isGivenDefinition),
        chart => -getChartScore(chart)
      ),
    ];
  }

  /**
   * For pie charts, folds small slices into a single `'Other'` bucket.
   *
   * A bucket is folded when its share of the total cardinality is below the
   * ratio limit, or when it is not among the `countLimit` largest remaining
   * buckets. Limits come from the definition (`pieRatioLimit`, `pieCountLimit`)
   * and fall back to `ChartLimits`. A limit of 0 disables that criterion.
   * Non-pie charts are left untouched. The chart is modified in place.
   */
  groupPieOtherBuckets(chart: ProcessedChart) {
    if (chart.definition.chartType !== 'pie') {
      return; // only for pie charts
    }

    const ratioLimit = chart.definition.pieRatioLimit ?? ChartLimits.PIE_RATIO_LIMIT;
    const countLimit = chart.definition.pieCountLimit ?? ChartLimits.PIE_COUNT_LIMIT;
    if (ratioLimit == 0 && countLimit == 0) {
      return; // no grouping if limit is 0
    }

    const cardSum = _sum(Object.values(chart.buckets).map(bucket => computeChartBucketCardinality(bucket)));
    if (cardSum == 0) {
      return; // no buckets to process
    }

    const otherBucket: { [field: string]: any } = {};
    const addToOther = (bucket: { [field: string]: any }) => {
      for (const field in bucket) {
        otherBucket[field] = (otherBucket[field] ?? 0) + bucket[field];
      }
    };

    let newBuckets: { [key: string]: any } = {};
    for (const [bucketKey, bucket] of Object.entries(chart.buckets)) {
      if (computeChartBucketCardinality(bucket) / cardSum < ratioLimit) {
        addToOther(bucket);
      } else {
        newBuckets[bucketKey] = bucket;
      }
    }

    // countLimit 0 means "no count limit"; without this guard every bucket would end up in 'Other'.
    if (countLimit > 0 && Object.keys(newBuckets).length > countLimit) {
      const keysByCardinality = _sortBy(
        Object.entries(newBuckets),
        ([, bucket]) => -computeChartBucketCardinality(bucket)
      ).map(([key]) => key);

      const largestBuckets: { [key: string]: any } = {};
      keysByCardinality.forEach((key, index) => {
        if (index < countLimit) {
          largestBuckets[key] = newBuckets[key];
        } else {
          addToOther(newBuckets[key]);
        }
      });
      newBuckets = largestBuckets;
    }

    if (Object.keys(otherBucket).length > 0) {
      if (newBuckets['Other']) {
        // a real category named 'Other' survived: merge it instead of overwriting its values
        addToOther(newBuckets['Other']);
      }
      newBuckets['Other'] = otherBucket;
    }

    const orderedKeys = chart.bucketKeysOrdered.filter(key => key in newBuckets);
    if ('Other' in newBuckets && !orderedKeys.includes('Other')) {
      orderedKeys.push('Other'); // append once, even when a real category has the same name
    }

    chart.buckets = newBuckets;
    chart.bucketKeysOrdered = orderedKeys;
  }

  /**
   * Adds one row to a single chart.
   *
   * The row is ignored when the chart has no `xdef`, when the row has no value
   * in the x column, or when `computeChartBucketKey` yields no key. A row whose
   * x value is not a date is counted in `invalidXRows` when the chart uses a
   * `date:` transform. Otherwise the bucket is created on demand, `minX`/`maxX`
   * are updated, and the values are aggregated by
   * `aggregateChartNumericValuesFromSource`.
   *
   * @param chart Chart accumulator to update (mutated).
   * @param row Source row.
   * @param dateParsed Parsed date of the chart's x column in this row, if any.
   * @param numericColumns Numeric values of the row, keyed by column name.
   * @param stringColumns Short string values of the row (not read by this method).
   */
  applyRawData(
    chart: ProcessedChart,
    row: any,
    dateParsed: ChartDateParsed,
    numericColumns: { [key: string]: number },
    stringColumns: { [key: string]: string }
  ) {
    if (chart.definition.xdef == null) {
      return;
    }
    if (row[chart.definition.xdef.field] == null) {
      return;
    }

    // transformFunction may be missing, hence the optional call
    if (dateParsed == null && chart.definition.xdef.transformFunction?.startsWith('date:')) {
      chart.invalidXRows += 1;
      return; // skip if date is invalid
    }

    const [bucketKey, bucketKeyParsed] = computeChartBucketKey(dateParsed, chart, row);
    if (!bucketKey) {
      return; // skip if no bucket key
    }

    if (bucketKeyParsed) {
      chart.bucketKeyDateParsed[bucketKey] = bucketKeyParsed;
    }

    if (chart.minX == null || bucketKey < chart.minX) {
      chart.minX = bucketKey;
    }
    if (chart.maxX == null || bucketKey > chart.maxX) {
      chart.maxX = bucketKey;
    }

    if (!chart.buckets[bucketKey]) {
      chart.buckets[bucketKey] = {};
      if (chart.definition.xdef.sortOrder == 'natural') {
        // natural order = order of first appearance
        chart.bucketKeysOrdered.push(bucketKey);
      }
    }

    aggregateChartNumericValuesFromSource(chart, bucketKey, numericColumns, row);
    chart.rowsAdded += 1;
  }
}