split query - own implementation

This commit is contained in:
Jan Prochazka
2021-05-31 21:56:56 +02:00
parent e480e08e0e
commit fe055d4b70
5 changed files with 430 additions and 204 deletions

View File

@@ -0,0 +1,22 @@
import { SplitterOptions } from './options';
/**
 * Scanner state for one splitter run over an input string.
 *
 * NOTE(review): nothing in the visible code reads or writes this interface,
 * so the per-field notes below are inferred from the names only — confirm
 * against the code that consumes it.
 */
export interface State {
// Splitter configuration (type declared in ./options).
options: SplitterOptions;
// presumably the offset in `input` where the current span begins — TODO confirm
start: number;
// presumably the offset in `input` where scanning stops — TODO confirm
end: number;
// presumably the current scan offset in `input` — TODO confirm
position: number;
// presumably the full text being scanned — TODO confirm
input: string;
// unread: string;
// currentDelimiter: string;
// currentStatement: string;
// output: string[];
// Regex field; its name mirrors the key-token regex cached for the ';' delimiter
// by the regex-based splitter — usage for this interface is not visible here.
semicolonKeyTokenRegex: RegExp;
}
/**
 * A lexical token described by a kind, its text and a pair of offsets.
 *
 * NOTE(review): no producer or consumer of this interface is visible, so
 * whether `end` is inclusive or exclusive cannot be told from here — confirm.
 */
export interface Token {
// Token kind discriminator (free-form string in this declaration).
type: string;
// Text associated with the token.
value: string;
// presumably the offset of the token's first character — TODO confirm
start: number;
// presumably the offset where the token ends — TODO confirm inclusive/exclusive
end: number;
}

View File

@@ -1,6 +1,8 @@
export interface SplitterOptions { export interface SplitterOptions {
allowBacktickString: boolean; stringsBegins: string[];
allowIndexParenString: boolean; stringsEnds: { string: string };
stringEscapes: string[];
allowSemicolon: boolean; allowSemicolon: boolean;
allowCustomDelimiter: boolean; allowCustomDelimiter: boolean;
allowGoDelimiter: boolean; allowGoDelimiter: boolean;
@@ -8,28 +10,39 @@ export interface SplitterOptions {
} }
export const defaultSplitterOptions = { export const defaultSplitterOptions = {
allowBacktickString: false, stringsBegins: ["'"],
stringsEnds: { "'": "'" },
stringEscapes: { "'": "'" },
allowSemicolon: true, allowSemicolon: true,
allowCustomDelimiter: false, allowCustomDelimiter: false,
allowGoDelimiter: false, allowGoDelimiter: false,
allowDollarDollarString: false,
allowIndexParenString: false,
}; };
export const mysqlSplitterOptions = { export const mysqlSplitterOptions = {
...defaultSplitterOptions, ...defaultSplitterOptions,
allowCustomDelimiter: true,
allowBacktickString: true, stringsBegins: ["'", '`'],
stringsEnds: { "'": "'", '`': '`' },
stringEscapes: { "'": '\\', '`': '`' },
}; };
export const mssqlSplitterOptions = { export const mssqlSplitterOptions = {
...defaultSplitterOptions, ...defaultSplitterOptions,
allowSemicolon: false, allowSemicolon: false,
allowGoDelimiter: true, allowGoDelimiter: true,
allowIndexParenString: true,
stringsBegins: ["'", '['],
stringsEnds: { "'": "'", '[': ']' },
stringEscapes: { "'": "'" },
}; };
export const postgreSplitterOptions = { export const postgreSplitterOptions = {
...defaultSplitterOptions, ...defaultSplitterOptions,
allowDollarDollarString: true, allowDollarDollarString: true,
stringsBegins: ["'", '"'],
stringsEnds: { "'": "'", '"': '"' },
stringEscapes: { "'": "'", '"': '"' },
}; };

View File

@@ -0,0 +1,226 @@
import { SplitterOptions, defaultSplitterOptions } from './options';
// Quote characters that open a quoted span; each one has a matching
// end-of-quote regex registered in quoteEndRegexDict.
const SINGLE_QUOTE = "'";
const DOUBLE_QUOTE = '"';
const BACKTICK = '`';
// Comment openers. '--' and '#' comments are consumed up to the next line
// break; '/*' comments are consumed up to the C-style comment terminator.
const DOUBLE_DASH_COMMENT_START = '--';
const HASH_COMMENT_START = '#';
const C_STYLE_COMMENT_START = '/*';
// Statement delimiter that splitQuery starts with.
const SEMICOLON = ';';
// NOTE(review): not referenced by any code visible in this file — possibly unused.
const LINE_FEED = '\n';
// Keyword of the client-side command that switches the statement delimiter.
const DELIMITER_KEYWORD = 'DELIMITER';
/**
 * Mutable working state shared by all helpers during one splitQuery() call.
 */
interface SplitExecutionContext {
// Caller-supplied options merged over defaultSplitterOptions.
options: SplitterOptions;
// Input text that has not been consumed yet; helpers shrink it from the front.
unread: string;
// Delimiter currently terminating statements; starts as ';' and is replaced
// when a DELIMITER command is handled.
currentDelimiter: string;
// Text accumulated for the statement being built; reset after each publish.
currentStatement: string;
// Completed, trimmed, non-empty statements in input order.
output: string[];
// Key-token regex for the ';' delimiter, built once per run so it is not
// rebuilt on every scan while the delimiter is still ';'.
semicolonKeyTokenRegex: RegExp;
}
/**
 * Outcome of searching a string with a regex (see findExp).
 */
interface FindExpResult {
// Index where the match starts, or -1 when nothing matched.
expIndex: number;
// The matched text, or null when nothing matched.
exp: string | null;
// Index just past the match, or the length of the searched string when nothing matched.
nextIndex: number;
}
// Characters that escapeRegex() prefixes with a backslash.
const regexEscapeSetRegex = /[-/\\^$*+?.()|[\]{}]/g;
// Closing quote: a quote character that is not immediately preceded by a backslash.
const singleQuoteStringEndRegex = /(?<!\\)'/;
const doubleQuoteStringEndRegex = /(?<!\\)"/;
// Closing backtick: a backtick with no backtick directly before or after it,
// so a doubled backtick inside the quoted span does not terminate it.
const backtickQuoteEndRegex = /(?<!`)`(?!`)/;
// '--' only starts a comment when followed by a whitespace character.
const doubleDashCommentStartRegex = /--[ \f\n\r\t\v]/;
const cStyleCommentStartRegex = /\/\*/;
// Comment terminator: star-slash not preceded by a slash, so the tail of "/*/" does not close.
const cStyleCommentEndRegex = /(?<!\/)\*\//;
// A run of CR/LF characters, or end of input; because of the `$` branch this always matches.
const newLineRegex = /(?:[\r\n]+|$)/;
// DELIMITER keyword at the start of the input or right after line break(s),
// optionally indented, followed by at least one space or tab; case-insensitive.
const delimiterStartRegex = /(?:^|[\n\r]+)[ \f\t\v]*DELIMITER[ \t]+/i;
// Best effort only, unable to find a syntax specification on delimiter
const delimiterTokenRegex = /^(?:'(.+)'|"(.+)"|`(.+)`|([^\s]+))/;
// const semicolonKeyTokenRegex = buildKeyTokenRegex(SEMICOLON);
// Maps an opening quote character to the regex that finds its closing quote.
const quoteEndRegexDict: Record<string, RegExp> = {
[SINGLE_QUOTE]: singleQuoteStringEndRegex,
[DOUBLE_QUOTE]: doubleQuoteStringEndRegex,
[BACKTICK]: backtickQuoteEndRegex,
};
/**
 * Makes `value` safe to embed in a regular expression source by putting a
 * backslash in front of every character matched by `regexEscapeSetRegex`.
 *
 * @param value - Literal text to escape.
 * @returns The escaped text.
 */
function escapeRegex(value: string): string {
  return value.replace(regexEscapeSetRegex, metaChar => `\\${metaChar}`);
}
/**
 * Builds the case-insensitive regex that locates the next "key token" in the
 * unread input: the active delimiter, a quote opener, a comment opener or,
 * when enabled, a DELIMITER command.
 *
 * @param delimiter - Active statement delimiter; escaped before being embedded.
 * @param context - Supplies the options that switch the optional alternatives.
 * @returns A non-global regex of the form `(?:alt1|alt2|...)` with the `i` flag.
 */
function buildKeyTokenRegex(delimiter: string, context: SplitExecutionContext): RegExp {
  const { allowBacktickString, allowCustomDelimiter } = context.options;

  // Alternation order is part of the regex source, so it is kept stable.
  const alternatives: string[] = [escapeRegex(delimiter), SINGLE_QUOTE, DOUBLE_QUOTE];
  if (allowBacktickString) {
    alternatives.push(BACKTICK);
  }
  alternatives.push(doubleDashCommentStartRegex.source, HASH_COMMENT_START, cStyleCommentStartRegex.source);
  if (allowCustomDelimiter) {
    alternatives.push(delimiterStartRegex.source);
  }

  return new RegExp(`(?:${alternatives.join('|')})`, 'i');
}
/**
 * Searches `content` for the first match of `regex`.
 *
 * @param content - Text to search.
 * @param regex - Pattern to look for.
 * @returns Where the match starts, what matched and where it ends. When the
 *   match result carries no index, returns `expIndex: -1`, `exp: null` and
 *   `nextIndex` equal to `content.length`.
 */
function findExp(content: string, regex: RegExp): FindExpResult {
  const found = content.match(regex);

  if (found === null || found.index === undefined) {
    return { expIndex: -1, exp: null, nextIndex: content.length };
  }

  const [matchedText] = found;
  return {
    expIndex: found.index,
    exp: matchedText,
    nextIndex: found.index + matchedText.length,
  };
}
/**
 * Finds the next key token in `content` for the given delimiter.
 *
 * The regex cached on the context is used while the delimiter is ';';
 * for any other delimiter a regex is built for this call.
 *
 * @param content - Text to search (the unread input).
 * @param currentDelimiter - Delimiter that currently ends statements.
 * @param context - Holds the cached ';' regex and the splitter options.
 * @returns The search result as produced by findExp.
 */
function findKeyToken(content: string, currentDelimiter: string, context: SplitExecutionContext): FindExpResult {
  const keyTokenRegex =
    currentDelimiter === SEMICOLON ? context.semicolonKeyTokenRegex : buildKeyTokenRegex(currentDelimiter, context);
  return findExp(content, keyTokenRegex);
}
/**
 * Locates the closing quote for a quoted span opened with `quote`.
 *
 * @param content - Text following the opening quote.
 * @param quote - The opening quote character; must be a key of quoteEndRegexDict.
 * @returns The search result as produced by findExp.
 * @throws TypeError when `quote` has no entry in quoteEndRegexDict.
 */
function findEndQuote(content: string, quote: string): FindExpResult {
  const isKnownQuote = quote in quoteEndRegexDict;
  if (isKnownQuote) {
    return findExp(content, quoteEndRegexDict[quote]);
  }
  throw new TypeError(`Incorrect quote ${quote} supplied`);
}
/**
 * Moves the first `readToIndex` characters of the unread input onto the
 * current statement and advances the unread input.
 *
 * @param context - Execution context to mutate.
 * @param readToIndex - Exclusive end of the slice appended to the statement.
 * @param nextUnreadIndex - Where the unread input resumes; lets the caller
 *   skip characters (such as a delimiter) that belong to neither side.
 *   Ignored unless it is a positive number, in which case the unread input
 *   resumes at `readToIndex` instead.
 */
function read(context: SplitExecutionContext, readToIndex: number, nextUnreadIndex?: number): void {
  const pending = context.unread;
  const resumeAt = nextUnreadIndex !== undefined && nextUnreadIndex > 0 ? nextUnreadIndex : readToIndex;

  context.currentStatement += pending.slice(0, readToIndex);
  context.unread = pending.slice(resumeAt);
}
/**
 * Appends everything up to (not including) the next line break, or up to the
 * end of input, to the current statement.
 *
 * @param context - Execution context to mutate.
 */
function readTillNewLine(context: SplitExecutionContext): void {
  // newLineRegex also matches at end of input, so the index is never -1.
  const { expIndex: lineEnd } = findExp(context.unread, newLineRegex);
  read(context, lineEnd, lineEnd);
}
/**
 * Drops the first `nextUnreadIndex` characters of the unread input without
 * adding them to the current statement.
 *
 * @param context - Execution context to mutate.
 * @param nextUnreadIndex - Number of leading characters to drop; values that
 *   are zero or negative leave the context untouched.
 */
function discard(context: SplitExecutionContext, nextUnreadIndex: number): void {
  if (!(nextUnreadIndex > 0)) {
    return;
  }
  context.unread = context.unread.slice(nextUnreadIndex);
}
/**
 * Drops the rest of the current line from the unread input, stopping in
 * front of the line break (or at end of input). Nothing is added to the
 * current statement.
 *
 * @param context - Execution context to mutate.
 */
function discardTillNewLine(context: SplitExecutionContext): void {
  const { expIndex: lineEnd } = findExp(context.unread, newLineRegex);
  // A line break at position 0 means there is nothing to drop.
  if (lineEnd > 0) {
    context.unread = context.unread.slice(lineEnd);
  }
}
/**
 * Finishes the statement being built: its trimmed text is appended to the
 * output unless it is empty, and the accumulator is reset either way.
 *
 * @param context - Execution context to mutate.
 */
function publishStatement(context: SplitExecutionContext): void {
  const statement = context.currentStatement.trim();
  context.currentStatement = '';

  if (statement.length > 0) {
    context.output.push(statement);
  }
}
/**
 * Consumes input according to the key token found by findKeyToken():
 *
 * - current delimiter: text before it completes a statement, the delimiter is dropped;
 * - quote opener: the whole quoted span is copied to the statement;
 * - `--` / `#`: the comment is copied up to the end of the line;
 * - C-style comment opener: the comment is copied up to its terminator;
 * - DELIMITER command: the new delimiter is recorded and the command line is
 *   dropped from the output;
 * - nothing found: the rest of the input completes the last statement.
 *
 * The key-token regex is case-insensitive, so the DELIMITER keyword is
 * recognised in any letter case (`delimiter $$`, `Delimiter //`, ...).
 *
 * @param context - Execution context to mutate.
 * @param findResult - Result of the key-token search on `context.unread`.
 * @throws Error when the matched text corresponds to no known token.
 */
function handleKeyTokenFindResult(context: SplitExecutionContext, findResult: FindExpResult): void {
  // `exp` is null when nothing matched, which makes `token` undefined.
  const token = findResult.exp?.trim();

  // The match was found case-insensitively, so compare the keyword the same
  // way. The current delimiter keeps priority, as it is checked first below.
  const isDelimiterKeyword =
    token !== undefined && token !== context.currentDelimiter && token.toUpperCase() === DELIMITER_KEYWORD;

  switch (isDelimiterKeyword ? DELIMITER_KEYWORD : token) {
    case context.currentDelimiter:
      // Statement ends in front of the delimiter; the delimiter itself is skipped.
      read(context, findResult.expIndex, findResult.nextIndex);
      publishStatement(context);
      break;
    case SINGLE_QUOTE:
    case DOUBLE_QUOTE:
    case BACKTICK: {
      // Copy the opening quote, then everything through the closing quote
      // (or through end of input when the quote is never closed).
      read(context, findResult.nextIndex);
      const findQuoteResult = findEndQuote(context.unread, findResult.exp);
      read(context, findQuoteResult.nextIndex, undefined);
      break;
    }
    case DOUBLE_DASH_COMMENT_START:
    case HASH_COMMENT_START: {
      // Line comments stay part of the statement text.
      read(context, findResult.nextIndex);
      readTillNewLine(context);
      break;
    }
    case C_STYLE_COMMENT_START: {
      read(context, findResult.nextIndex);
      const findCommentResult = findExp(context.unread, cStyleCommentEndRegex);
      read(context, findCommentResult.nextIndex);
      break;
    }
    case DELIMITER_KEYWORD: {
      // Keep the text in front of the command, drop the keyword itself.
      read(context, findResult.expIndex, findResult.nextIndex);
      // MySQL client will return `DELIMITER cannot contain a backslash character` if backslash is used
      // Shall we reject backslash as well?
      const matched = context.unread.match(delimiterTokenRegex);
      if (matched?.index !== undefined) {
        context.currentDelimiter = matched[0].trim();
        discard(context, matched[0].length);
      }
      discardTillNewLine(context);
      break;
    }
    case undefined:
      // No key token left: the remainder is the final statement.
      read(context, findResult.nextIndex);
      publishStatement(context);
      break;
    default:
      // NOTE(review): still reachable when an alphabetic custom delimiter is
      // matched in a different letter case (the key-token regex ignores case,
      // the comparison above does not) — decide whether that should split.
      throw new Error(`Unknown token '${findResult.exp ?? '(null)'}'`);
  }
}
/**
 * Splits a SQL script into individual statements.
 *
 * Statements are separated by the active delimiter (initially ';'). Each
 * returned statement is trimmed, empty statements are omitted, and
 * delimiters are not included.
 *
 * @param sql - Script text to split.
 * @param options - Splitter options; merged over `defaultSplitterOptions`.
 * @returns The statements in the order they appear in `sql`.
 */
export function splitQuery(sql: string, options: SplitterOptions = null): string[] {
  const context: SplitExecutionContext = {
    unread: sql,
    currentDelimiter: SEMICOLON,
    currentStatement: '',
    output: [],
    semicolonKeyTokenRegex: null,
    options: { ...defaultSplitterOptions, ...options },
  };
  // Built after the context exists because the builder reads context.options.
  context.semicolonKeyTokenRegex = buildKeyTokenRegex(SEMICOLON, context);

  for (;;) {
    const unreadBefore = context.unread.length;
    const keyToken = findKeyToken(context.unread, context.currentDelimiter, context);
    handleKeyTokenFindResult(context, keyToken);

    // Prevent infinite loop by returning incorrect result
    if (context.unread.length === unreadBefore) {
      read(context, context.unread.length);
    }
    if (context.unread === '') {
      break;
    }
  }

  // Flush whatever is left after the last delimiter.
  publishStatement(context);
  return context.output;
}

View File

@@ -1,226 +1,182 @@
import { SplitterOptions, defaultSplitterOptions } from './options'; import { SplitterOptions, defaultSplitterOptions } from './options';
const SINGLE_QUOTE = "'";
const DOUBLE_QUOTE = '"';
const BACKTICK = '`';
const DOUBLE_DASH_COMMENT_START = '--';
const HASH_COMMENT_START = '#';
const C_STYLE_COMMENT_START = '/*';
const SEMICOLON = ';'; const SEMICOLON = ';';
const LINE_FEED = '\n';
const DELIMITER_KEYWORD = 'DELIMITER';
interface SplitExecutionContext { interface SplitExecutionContext {
options: SplitterOptions; options: SplitterOptions;
unread: string; source: string;
position: number;
currentDelimiter: string; currentDelimiter: string;
currentStatement: string;
output: string[]; output: string[];
semicolonKeyTokenRegex: RegExp; end: number;
wasDataOnLine: boolean;
currentCommandStart: number;
// unread: string;
// currentStatement: string;
// semicolonKeyTokenRegex: RegExp;
} }
interface FindExpResult { function isStringEnd(s: string, pos: number, endch: string, escapech: string) {
expIndex: number; if (!escapech) {
exp: string | null; return s[pos] == endch;
nextIndex: number; }
if (endch == escapech) {
return s[pos] == endch && s[pos + 1] != endch;
} else {
return s[pos] == endch && s[pos - 1] != escapech;
}
} }
const regexEscapeSetRegex = /[-/\\^$*+?.()|[\]{}]/g; interface Token {
const singleQuoteStringEndRegex = /(?<!\\)'/; type: 'string' | 'delimiter' | 'whitespace' | 'eoln' | 'data' | 'set_delimiter';
const doubleQuoteStringEndRegex = /(?<!\\)"/; length: number;
const backtickQuoteEndRegex = /(?<!`)`(?!`)/; value?: string;
const doubleDashCommentStartRegex = /--[ \f\n\r\t\v]/; }
const cStyleCommentStartRegex = /\/\*/;
const cStyleCommentEndRegex = /(?<!\/)\*\//; const WHITESPACE_TOKEN: Token = {
const newLineRegex = /(?:[\r\n]+|$)/; type: 'whitespace',
const delimiterStartRegex = /(?:^|[\n\r]+)[ \f\t\v]*DELIMITER[ \t]+/i; length: 1,
// Best effort only, unable to find a syntax specification on delimiter };
const delimiterTokenRegex = /^(?:'(.+)'|"(.+)"|`(.+)`|([^\s]+))/; const EOLN_TOKEN: Token = {
// const semicolonKeyTokenRegex = buildKeyTokenRegex(SEMICOLON); type: 'eoln',
const quoteEndRegexDict: Record<string, RegExp> = { length: 1,
[SINGLE_QUOTE]: singleQuoteStringEndRegex, };
[DOUBLE_QUOTE]: doubleQuoteStringEndRegex, const DATA_TOKEN: Token = {
[BACKTICK]: backtickQuoteEndRegex, type: 'data',
length: 1,
}; };
function escapeRegex(value: string): string { function scanToken(context: SplitExecutionContext): Token {
return value.replace(regexEscapeSetRegex, '\\$&'); let pos = context.position;
} const s = context.source;
const ch = s[pos];
function buildKeyTokenRegex(delimiter: string, context: SplitExecutionContext): RegExp { if (context.options.stringsBegins.includes(ch)) {
const { options } = context; pos++;
return new RegExp( const endch = context.options.stringsEnds[ch];
'(?:' + const escapech = context.options.stringEscapes[ch];
[ while (pos < context.end && !isStringEnd(s, pos, endch, escapech)) {
escapeRegex(delimiter), if (endch == escapech && s[pos] == endch && s[pos + 1] == endch) {
SINGLE_QUOTE, pos += 2;
DOUBLE_QUOTE, } else {
options.allowBacktickString ? BACKTICK : undefined, pos++;
doubleDashCommentStartRegex.source,
HASH_COMMENT_START,
cStyleCommentStartRegex.source,
options.allowCustomDelimiter ? delimiterStartRegex.source : undefined,
]
.filter(x => x !== undefined)
.join('|') +
')',
'i'
);
}
function findExp(content: string, regex: RegExp): FindExpResult {
const match = content.match(regex);
let result: FindExpResult;
if (match?.index !== undefined) {
result = {
expIndex: match.index,
exp: match[0],
nextIndex: match.index + match[0].length,
};
} else {
result = {
expIndex: -1,
exp: null,
nextIndex: content.length,
};
}
return result;
}
function findKeyToken(content: string, currentDelimiter: string, context: SplitExecutionContext): FindExpResult {
let regex;
if (currentDelimiter === SEMICOLON) {
regex = context.semicolonKeyTokenRegex;
} else {
regex = buildKeyTokenRegex(currentDelimiter, context);
}
return findExp(content, regex);
}
function findEndQuote(content: string, quote: string): FindExpResult {
if (!(quote in quoteEndRegexDict)) {
throw new TypeError(`Incorrect quote ${quote} supplied`);
}
return findExp(content, quoteEndRegexDict[quote]);
}
function read(context: SplitExecutionContext, readToIndex: number, nextUnreadIndex?: number): void {
const readContent = context.unread.slice(0, readToIndex);
context.currentStatement += readContent;
if (nextUnreadIndex !== undefined && nextUnreadIndex > 0) {
context.unread = context.unread.slice(nextUnreadIndex);
} else {
context.unread = context.unread.slice(readToIndex);
}
}
function readTillNewLine(context: SplitExecutionContext): void {
const findResult = findExp(context.unread, newLineRegex);
read(context, findResult.expIndex, findResult.expIndex);
}
function discard(context: SplitExecutionContext, nextUnreadIndex: number): void {
if (nextUnreadIndex > 0) {
context.unread = context.unread.slice(nextUnreadIndex);
}
}
function discardTillNewLine(context: SplitExecutionContext): void {
const findResult = findExp(context.unread, newLineRegex);
discard(context, findResult.expIndex);
}
function publishStatement(context: SplitExecutionContext): void {
const trimmed = context.currentStatement.trim();
if (trimmed) {
context.output.push(trimmed);
}
context.currentStatement = '';
}
function handleKeyTokenFindResult(context: SplitExecutionContext, findResult: FindExpResult): void {
switch (findResult.exp?.trim()) {
case context.currentDelimiter:
read(context, findResult.expIndex, findResult.nextIndex);
publishStatement(context);
break;
case SINGLE_QUOTE:
case DOUBLE_QUOTE:
case BACKTICK: {
read(context, findResult.nextIndex);
const findQuoteResult = findEndQuote(context.unread, findResult.exp);
read(context, findQuoteResult.nextIndex, undefined);
break;
}
case DOUBLE_DASH_COMMENT_START: {
read(context, findResult.nextIndex);
readTillNewLine(context);
break;
}
case HASH_COMMENT_START: {
read(context, findResult.nextIndex);
readTillNewLine(context);
break;
}
case C_STYLE_COMMENT_START: {
read(context, findResult.nextIndex);
const findCommentResult = findExp(context.unread, cStyleCommentEndRegex);
read(context, findCommentResult.nextIndex);
break;
}
case DELIMITER_KEYWORD: {
read(context, findResult.expIndex, findResult.nextIndex);
// MySQL client will return `DELIMITER cannot contain a backslash character` if backslash is used
// Shall we reject backslash as well?
const matched = context.unread.match(delimiterTokenRegex);
if (matched?.index !== undefined) {
context.currentDelimiter = matched[0].trim();
discard(context, matched[0].length);
} }
discardTillNewLine(context);
break;
} }
case undefined: return {
case null: type: 'string',
read(context, findResult.nextIndex); length: pos - context.position + 1,
publishStatement(context); };
break;
default:
// This should never happen
throw new Error(`Unknown token '${findResult.exp ?? '(null)'}'`);
} }
if (context.currentDelimiter && s.slice(pos).startsWith(context.currentDelimiter)) {
return {
type: 'delimiter',
length: context.currentDelimiter.length,
};
}
if (ch == ' ' || ch == '\t' || ch == '\r') {
return WHITESPACE_TOKEN;
}
if (ch == '\n') {
return EOLN_TOKEN;
}
if (context.options.allowCustomDelimiter && !context.wasDataOnLine) {
const m = s.slice(pos).match(/^DELIMITER[ \t]+([^\n]+)/i);
if (m) {
return {
type: 'set_delimiter',
value: m[1].trim(),
length: m[0].length,
};
}
}
return DATA_TOKEN;
}
function pushQuery(context) {
const sql = context.source.slice(context.currentCommandStart, context.position);
const trimmed = sql.trim();
if (trimmed) context.output.push(trimmed);
} }
export function splitQuery(sql: string, options: SplitterOptions = null): string[] { export function splitQuery(sql: string, options: SplitterOptions = null): string[] {
const context: SplitExecutionContext = { const context: SplitExecutionContext = {
unread: sql, source: sql,
currentDelimiter: SEMICOLON, end: sql.length,
currentStatement: '', currentDelimiter: options?.allowSemicolon === false ? null : SEMICOLON,
position: 0,
currentCommandStart: 0,
output: [], output: [],
semicolonKeyTokenRegex: null, wasDataOnLine: false,
options: { options: {
...defaultSplitterOptions, ...defaultSplitterOptions,
...options, ...options,
}, },
}; };
context.semicolonKeyTokenRegex = buildKeyTokenRegex(SEMICOLON, context);
let findResult: FindExpResult = { while (context.position < context.end) {
expIndex: -1, const token = scanToken(context);
exp: null, if (!token) {
nextIndex: 0, // nothing special, move forward
}; context.position += 1;
let lastUnreadLength; continue;
do {
// console.log('context.unread', context.unread);
lastUnreadLength = context.unread.length;
findResult = findKeyToken(context.unread, context.currentDelimiter, context);
handleKeyTokenFindResult(context, findResult);
// Prevent infinite loop by returning incorrect result
if (lastUnreadLength === context.unread.length) {
read(context, context.unread.length);
} }
} while (context.unread !== ''); switch (token.type) {
publishStatement(context); case 'string':
context.position += token.length;
context.wasDataOnLine = true;
break;
case 'eoln':
context.position += token.length;
context.wasDataOnLine = false;
break;
case 'data':
context.position += token.length;
context.wasDataOnLine = true;
break;
case 'whitespace':
context.position += token.length;
break;
case 'set_delimiter':
context.currentDelimiter = token.value;
context.position += token.length;
break;
case 'delimiter':
pushQuery(context);
context.position += token.length;
context.currentCommandStart = context.position;
break;
}
}
if (context.end > context.currentCommandStart) {
pushQuery(context);
}
// context.semicolonKeyTokenRegex = buildKeyTokenRegex(SEMICOLON, context);
// let findResult: FindExpResult = {
// expIndex: -1,
// exp: null,
// nextIndex: 0,
// };
// let lastUnreadLength;
// do {
// // console.log('context.unread', context.unread);
// lastUnreadLength = context.unread.length;
// findResult = findKeyToken(context.unread, context.currentDelimiter, context);
// handleKeyTokenFindResult(context, findResult);
// // Prevent infinite loop by returning incorrect result
// if (lastUnreadLength === context.unread.length) {
// read(context, context.unread.length);
// }
// } while (context.unread !== '');
// publishStatement(context);
// console.log('RESULT', context.output); // console.log('RESULT', context.output);
return context.output; return context.output;
} }

View File

@@ -1,4 +1,4 @@
import { mysqlSplitterOptions } from './options'; import { mysqlSplitterOptions, mssqlSplitterOptions } from './options';
import { splitQuery } from './splitQuery'; import { splitQuery } from './splitQuery';
test('simple query', () => { test('simple query', () => {
@@ -28,7 +28,16 @@ test('should handle double backtick', () => {
}); });
test('semicolon inside string', () => { test('semicolon inside string', () => {
const input = ['CREATE TABLE [a;1]', "INSERT INTO [a;1] (x) VALUES ('1;2;3;4')"]; const input = ['CREATE TABLE a', "INSERT INTO a (x) VALUES ('1;2;3;4')"];
const output = splitQuery(input.join(';\n') + ';', mysqlSplitterOptions); const output = splitQuery(input.join(';\n') + ';', mysqlSplitterOptions);
expect(output).toEqual(input); expect(output).toEqual(input);
}); });
test('semicolon inside identyifier - mssql', () => {
const input = ['CREATE TABLE [a;1]', "INSERT INTO [a;1] (x) VALUES ('1')"];
const output = splitQuery(input.join(';\n') + ';', {
...mssqlSplitterOptions,
allowSemicolon: true,
});
expect(output).toEqual(input);
});