134 lines
3.2 KiB
JavaScript
Raw Normal View History

2020-05-17 19:43:37 +02:00
/**
 * Splits a search query string into two streams of tokens.
 *
 * The query is lower-cased first. Everything before the first unquoted,
 * unescaped `#` or `~` is the fulltext part; that character and everything
 * after it is the expression part. In the expression part operator symbols
 * (`=`, `*`, `>`, `<`, `!`) and the characters `(`, `)`, `.` are split off
 * into their own tokens.
 *
 * A backslash makes the following character literal. Text wrapped in `"`,
 * `'` or `` ` `` is kept as a single token (spaces included) and flagged with
 * `inQuotes: true`. Empty words never produce a token.
 *
 * @param {string} str - raw search query
 * @returns {{fulltextTokens: Array<{token: string, inQuotes: boolean}>,
 *            expressionTokens: Array<{token: string, inQuotes: boolean}>}}
 */
function lexer(str) {
    str = str.toLowerCase();

    const fulltextTokens = [];
    const expressionTokens = [];
    // false while outside quotes, otherwise the quote character which opened the current quoted section
    let quotes = false;
    let fulltextEnded = false;
    let currentWord = '';

    function isOperatorSymbol(chr) {
        return ['=', '*', '>', '<', '!'].includes(chr);
    }

    // true when the word being built currently ends with an operator symbol
    function previousOperatorSymbol() {
        if (currentWord.length === 0) {
            return false;
        }
        else {
            return isOperatorSymbol(currentWord[currentWord.length - 1]);
        }
    }

    // pushes the word being built (if any) into the stream matching the current mode and resets it
    function finishWord() {
        if (currentWord === '') {
            return;
        }

        const rec = {
            token: currentWord,
            inQuotes: !!quotes
        };

        if (fulltextEnded) {
            expressionTokens.push(rec);
        } else {
            fulltextTokens.push(rec);
        }

        currentWord = '';
    }

    for (let i = 0; i < str.length; i++) {
        const chr = str[i];

        if (chr === '\\') {
            if ((i + 1) < str.length) {
                // escaped character is taken literally, whatever it is
                i++;

                currentWord += str[i];
            }
            else {
                // trailing backslash has nothing to escape, keep it as is
                currentWord += chr;
            }

            continue;
        }
        else if (['"', "'", '`'].includes(chr)) {
            if (!quotes) {
                // a quoted section can start only at the beginning of a word or, in the expression
                // part, directly after an operator (e.g. #label="some value")
                if (currentWord.length === 0 || (fulltextEnded && previousOperatorSymbol())) {
                    finishWord();

                    quotes = chr;
                }
                else {
                    // quote inside a word does not have special meaning and does not break word
                    // e.g. d'Artagnan is kept as a single token
                    currentWord += chr;
                }
            }
            else if (quotes === chr) {
                // finish the word first so that it is still flagged as quoted
                finishWord();

                quotes = false;
            }
            else {
                // it's a quote but within other kind of quotes so it's valid as a literal character
                currentWord += chr;
            }

            continue;
        }
        else if (!quotes) {
            if (chr === '#' || chr === '~') {
                // flush the pending word into the stream of the mode it was typed in; without this
                // a fulltext word glued to the first # or ~ (e.g. "hello#label") would be lost
                finishWord();

                fulltextEnded = true;
                currentWord = chr;

                continue;
            }
            else if (['#', '~'].includes(currentWord) && chr === '!') {
                // negation prefix (#! or ~!) stays attached to the name which follows
                currentWord += chr;
                continue;
            }
            else if (chr === ' ') {
                finishWord();
                continue;
            }
            else if (fulltextEnded && ['(', ')', '.'].includes(chr)) {
                // these are always emitted as standalone single character tokens
                finishWord();
                currentWord += chr;
                finishWord();
                continue;
            }
            else if (fulltextEnded
                && !['#!', '~!'].includes(currentWord)
                && previousOperatorSymbol() !== isOperatorSymbol(chr)) {
                // transition between operator and non-operator characters starts a new token
                finishWord();

                currentWord += chr;
                continue;
            }
        }

        currentWord += chr;
    }

    finishWord();

    return {
        fulltextTokens,
        expressionTokens
    };
}
module.exports = lexer;