<?php
declare(strict_types=1);
namespace PhpMyAdmin\SqlParser\Utils;
use PhpMyAdmin\SqlParser\Components\JoinKeyword;
use PhpMyAdmin\SqlParser\Lexer;
use PhpMyAdmin\SqlParser\Parser;
use PhpMyAdmin\SqlParser\Token;
use PhpMyAdmin\SqlParser\TokensList;
use function array_merge;
use function array_pop;
use function end;
use function htmlspecialchars;
use function in_array;
use function mb_strlen;
use function str_contains;
use function str_repeat;
use function str_replace;
use function strtoupper;
use const ENT_NOQUOTES;
use const PHP_SAPI;
/**
* Utilities that are used for formatting queries.
*/
class Formatter
{
/**
* The formatting options.
*
* @var array<string, bool|string|array<int, array<string, int|string>>>
*/
public $options;
/**
* Clauses that are usually short.
*
* These clauses share the line with the next clause.
*
* E.g. if INSERT was not here, the formatter would produce:
*
* INSERT
* INTO foo
* VALUES(0, 0, 0),(1, 1, 1);
*
* Instead of:
*
* INSERT INTO foo
* VALUES(0, 0, 0),(1, 1, 1)
*
* @var array<string, bool>
*/
public static $SHORT_CLAUSES = [
'CREATE' => true,
'INSERT' => true,
];
/**
* Clauses that must be inlined.
*
* These clauses usually are short and it's nicer to have them inline.
*
* @var array<string, bool>
*/
public static $INLINE_CLAUSES = [
'CREATE' => true,
'INTO' => true,
'LIMIT' => true,
'PARTITION BY' => true,
'PARTITION' => true,
'PROCEDURE' => true,
'SUBPARTITION BY' => true,
'VALUES' => true,
];
/**
* @param array<string, bool|string|array<int, array<string, int|string>>> $options the formatting options
*/
public function __construct(array $options = [])
{
$this->options = $this->getMergedOptions($options);
}
/**
* The specified formatting options are merged with the default values.
*
* @param array<string, bool|string|array<int, array<string, int|string>>> $options
*
* @return array<string, bool|string|array<int, array<string, int|string>>>
*/
protected function getMergedOptions(array $options)
{
$options = array_merge(
$this->getDefaultOptions(),
$options
);
if (isset($options['formats'])) {
$options['formats'] = self::mergeFormats($this->getDefaultFormats(), $options['formats']);
} else {
$options['formats'] = $this->getDefaultFormats();
}
if ($options['line_ending'] === null) {
$options['line_ending'] = $options['type'] === 'html' ? '<br/>' : "\n";
}
if ($options['indentation'] === null) {
$options['indentation'] = $options['type'] === 'html' ? ' ' : ' ';
}
// `parts_newline` requires `clause_newline`
$options['parts_newline'] &= $options['clause_newline'];
return $options;
}
/**
* The default formatting options.
*
* @return array<string, bool|string|null>
* @psalm-return array{
* type: ('cli'|'text'),
* line_ending: null,
* indentation: null,
* remove_comments: false,
* clause_newline: true,
* parts_newline: true,
* indent_parts: true
* }
*/
protected function getDefaultOptions()
{
return [
/*
* The format of the result.
*
* @var string The type ('text', 'cli' or 'html')
*/
'type' => PHP_SAPI === 'cli' ? 'cli' : 'text',
/*
* The line ending used.
* By default, for text this is "\n" and for HTML this is "<br/>".
*
* @var string
*/
'line_ending' => null,
/*
* The string used for indentation.
*
* @var string
*/
'indentation' => null,
/*
* Whether comments should be removed or not.
*
* @var bool
*/
'remove_comments' => false,
/*
* Whether each clause should be on a new line.
*
* @var bool
*/
'clause_newline' => true,
/*
* Whether each part should be on a new line.
* Parts are delimited by brackets and commas.
*
* @var bool
*/
'parts_newline' => true,
/*
* Whether each part of each clause should be indented.
*
* @var bool
*/
'indent_parts' => true,
];
}
/**
* The styles used for HTML formatting.
* [$type, $flags, $span, $callback].
*
* @return array<int, array<string, int|string>>
* @psalm-return list<array{type: int, flags: int, html: string, cli: string, function: string}>
*/
protected function getDefaultFormats()
{
return [
[
'type' => Token::TYPE_KEYWORD,
'flags' => Token::FLAG_KEYWORD_RESERVED,
'html' => 'class="sql-reserved"',
'cli' => "\x1b[35m",
'function' => 'strtoupper',
],
[
'type' => Token::TYPE_KEYWORD,
'flags' => 0,
'html' => 'class="sql-keyword"',
'cli' => "\x1b[95m",
'function' => 'strtoupper',
],
[
'type' => Token::TYPE_COMMENT,
'flags' => 0,
'html' => 'class="sql-comment"',
'cli' => "\x1b[37m",
'function' => '',
],
[
'type' => Token::TYPE_BOOL,
'flags' => 0,
'html' => 'class="sql-atom"',
'cli' => "\x1b[36m",
'function' => 'strtoupper',
],
[
'type' => Token::TYPE_NUMBER,
'flags' => 0,
'html' => 'class="sql-number"',
'cli' => "\x1b[92m",
'function' => 'strtolower',
],
[
'type' => Token::TYPE_STRING,
'flags' => 0,
'html' => 'class="sql-string"',
'cli' => "\x1b[91m",
'function' => '',
],
[
'type' => Token::TYPE_SYMBOL,
'flags' => Token::FLAG_SYMBOL_PARAMETER,
'html' => 'class="sql-parameter"',
'cli' => "\x1b[31m",
'function' => '',
],
[
'type' => Token::TYPE_SYMBOL,
'flags' => 0,
'html' => 'class="sql-variable"',
'cli' => "\x1b[36m",
'function' => '',
],
];
}
/**
* @param array<int, array<string, int|string>> $formats
* @param array<int, array<string, int|string>> $newFormats
*
* @return array<int, array<string, int|string>>
*/
private static function mergeFormats(array $formats, array $newFormats): array
{
$added = [];
$integers = [
'flags',
'type',
];
$strings = [
'html',
'cli',
'function',
];
/* Sanitize the array so that we do not have to care later */
foreach ($newFormats as $j => $new) {
foreach ($integers as $name) {
if (isset($new[$name])) {
continue;
}
$newFormats[$j][$name] = 0;
}
foreach ($strings as $name) {
if (isset($new[$name])) {
continue;
}
$newFormats[$j][$name] = '';
}
}
/* Process changes to existing formats */
foreach ($formats as $i => $original) {
foreach ($newFormats as $j => $new) {
if ($new['type'] !== $original['type'] || $original['flags'] !== $new['flags']) {
continue;
}
$formats[$i] = $new;
$added[] = $j;
}
}
/* Add not already handled formats */
foreach ($newFormats as $j => $new) {
if (in_array($j, $added)) {
continue;
}
$formats[] = $new;
}
return $formats;
}
/**
* Formats the given list of tokens.
*
* @param TokensList $list the list of tokens
*
* @return string
*/
public function formatList($list)
{
/**
* The query to be returned.
*
* @var string
*/
$ret = '';
/**
* The indentation level.
*
* @var int
*/
$indent = 0;
/**
* Whether the line ended.
*
* @var bool
*/
$lineEnded = false;
/**
* Whether current group is short (no linebreaks).
*
* @var bool
*/
$shortGroup = false;
/**
* The name of the last clause.
*
* @var string
*/
$lastClause = '';
/**
* A stack that keeps track of the indentation level every time a new
* block is found.
*/
$blocksIndentation = [];
/**
* A stack that keeps track of the line endings every time a new block
* is found.
*/
$blocksLineEndings = [];
/**
* Whether clause's options were formatted.
*
* @var bool
*/
$formattedOptions = false;
/**
* Previously parsed token.
*
* @var Token|null
*/
$prev = null;
// In order to be able to format the queries correctly, the next token
// must be taken into consideration. The loop below uses two pointers,
// `$prev` and `$curr` which store two consecutive tokens.
// Actually, at every iteration the previous token is being used.
for ($list->idx = 0; $list->idx < $list->count; ++$list->idx) {
/**
* Token parsed at this moment.
*/
$curr = $list->tokens[$list->idx];
if ($list->idx + 1 < $list->count) {
$next = $list->tokens[$list->idx + 1];
} else {
$next = null;
}
if ($curr->type === Token::TYPE_WHITESPACE) {
// Keep linebreaks before and after comments
if (
str_contains($curr->token, "\n") && (
($prev !== null && $prev->type === Token::TYPE_COMMENT) ||
($next !== null && $next->type === Token::TYPE_COMMENT)
)
) {
$lineEnded = true;
}
// Whitespaces are skipped because the formatter adds its own.
continue;
}
if ($curr->type === Token::TYPE_COMMENT && $this->options['remove_comments']) {
// Skip Comments if option `remove_comments` is enabled
continue;
}
// Checking if pointers were initialized.
if ($prev !== null) {
// Checking if a new clause started.
if (static::isClause($prev) !== false) {
$lastClause = $prev->value;
$formattedOptions = false;
}
// The options of a clause should stay on the same line and everything that follows.
if (
$this->options['parts_newline']
&& ! $formattedOptions
&& empty(self::$INLINE_CLAUSES[$lastClause])
&& (
$curr->type !== Token::TYPE_KEYWORD
|| (
$curr->type === Token::TYPE_KEYWORD
&& $curr->flags & Token::FLAG_KEYWORD_FUNCTION
)
)
) {
$formattedOptions = true;
$lineEnded = true;
++$indent;
}
// Checking if this clause ended.
$isClause = static::isClause($curr);
if ($isClause) {
if (
($isClause === 2 || $this->options['clause_newline'])
&& empty(self::$SHORT_CLAUSES[$lastClause])
) {
$lineEnded = true;
if ($this->options['parts_newline'] && $indent > 0) {
--$indent;
}
}
}
// Inline JOINs
if (
($prev->type === Token::TYPE_KEYWORD && isset(JoinKeyword::$JOINS[$prev->value]))
|| (in_array($curr->value, ['ON', 'USING'], true)
&& isset(JoinKeyword::$JOINS[$list->tokens[$list->idx - 2]->value]))
|| isset($list->tokens[$list->idx - 4], JoinKeyword::$JOINS[$list->tokens[$list->idx - 4]->value])
|| isset($list->tokens[$list->idx - 6], JoinKeyword::$JOINS[$list->tokens[$list->idx - 6]->value])
) {
$lineEnded = false;
}
// Indenting BEGIN ... END blocks.
if ($prev->type === Token::TYPE_KEYWORD && $prev->keyword === 'BEGIN') {
$lineEnded = true;
$blocksIndentation[] = $indent;
++$indent;
} elseif ($curr->type === Token::TYPE_KEYWORD && $curr->keyword === 'END') {
$lineEnded = true;
$indent = array_pop($blocksIndentation);
}
// Formatting fragments delimited by comma.
if ($prev->type === Token::TYPE_OPERATOR && $prev->value === ',') {
// Fragments delimited by a comma are broken into multiple
// pieces only if the clause is not inlined or this fragment
// is between brackets that are on new line.
if (
end($blocksLineEndings) === true
|| (
empty(self::$INLINE_CLAUSES[$lastClause])
&& ! $shortGroup
&& $this->options['parts_newline']
)
) {
$lineEnded = true;
}
}
// Handling brackets.
// Brackets are indented only if the length of the fragment between
// them is longer than 30 characters.
if ($prev->type === Token::TYPE_OPERATOR && $prev->value === '(') {
$blocksIndentation[] = $indent;
$shortGroup = true;
if (static::getGroupLength($list) > 30) {
++$indent;
$lineEnded = true;
$shortGroup = false;
}
$blocksLineEndings[] = $lineEnded;
} elseif ($curr->type === Token::TYPE_OPERATOR && $curr->value === ')') {
$indent = array_pop($blocksIndentation);
$lineEnded |= array_pop($blocksLineEndings);
$shortGroup = false;
}
// Adding the token.
$ret .= $this->toString($prev);
// Finishing the line.
if ($lineEnded) {
$ret .= $this->options['line_ending']
. str_repeat($this->options['indentation'], (int) $indent);
$lineEnded = false;
} else {
// If the line ended there is no point in adding whitespaces.
// Also, some tokens do not have spaces before or after them.
if (
// A space after delimiters that are longer than 2 characters.
$prev->keyword === 'DELIMITER'
|| ! (
($prev->type === Token::TYPE_OPERATOR && ($prev->value === '.' || $prev->value === '('))
// No space after . (
|| ($curr->type === Token::TYPE_OPERATOR
&& ($curr->value === '.' || $curr->value === ','
|| $curr->value === '(' || $curr->value === ')'))
// No space before . , ( )
|| $curr->type === Token::TYPE_DELIMITER && mb_strlen((string) $curr->value, 'UTF-8') < 2
)
) {
$ret .= ' ';
}
}
}
// Iteration finished, consider current token as previous.
$prev = $curr;
}
if ($this->options['type'] === 'cli') {
return $ret . "\x1b[0m";
}
return $ret;
}
public function escapeConsole(string $string): string
{
return str_replace(
[
"\x00",
"\x01",
"\x02",
"\x03",
"\x04",
"\x05",
"\x06",
"\x07",
"\x08",
"\x09",
"\x0A",
"\x0B",
"\x0C",
"\x0D",
"\x0E",
"\x0F",
"\x10",
"\x11",
"\x12",
"\x13",
"\x14",
"\x15",
"\x16",
"\x17",
"\x18",
"\x19",
"\x1A",
"\x1B",
"\x1C",
"\x1D",
"\x1E",
"\x1F",
],
[
'\x00',
'\x01',
'\x02',
'\x03',
'\x04',
'\x05',
'\x06',
'\x07',
'\x08',
'\x09',
'\x0A',
'\x0B',
'\x0C',
'\x0D',
'\x0E',
'\x0F',
'\x10',
'\x11',
'\x12',
'\x13',
'\x14',
'\x15',
'\x16',
'\x17',
'\x18',
'\x19',
'\x1A',
'\x1B',
'\x1C',
'\x1D',
'\x1E',
'\x1F',
],
$string
);
}
/**
* Tries to print the query and returns the result.
*
* @param Token $token the token to be printed
*
* @return string
*/
public function toString($token)
{
$text = $token->token;
static $prev;
foreach ($this->options['formats'] as $format) {
if ($token->type !== $format['type'] || ! (($token->flags & $format['flags']) === $format['flags'])) {
continue;
}
// Running transformation function.
if (! empty($format['function'])) {
$func = $format['function'];
$text = $func($text);
}
// Formatting HTML.
if ($this->options['type'] === 'html') {
return '<span ' . $format['html'] . '>' . htmlspecialchars($text, ENT_NOQUOTES) . '</span>';
}
if ($this->options['type'] === 'cli') {
if ($prev !== $format['cli']) {
$prev = $format['cli'];
return $format['cli'] . $this->escapeConsole($text);
}
return $this->escapeConsole($text);
}
break;
}
if ($this->options['type'] === 'cli') {
if ($prev !== "\x1b[39m") {
$prev = "\x1b[39m";
return "\x1b[39m" . $this->escapeConsole($text);
}
return $this->escapeConsole($text);
}
if ($this->options['type'] === 'html') {
return htmlspecialchars($text, ENT_NOQUOTES);
}
return $text;
}
/**
* Formats a query.
*
* @param string $query The query to be formatted
* @param array<string, bool|string|array<int, array<string, int|string>>> $options the formatting options
*
* @return string the formatted string
*/
public static function format($query, array $options = [])
{
$lexer = new Lexer($query);
$formatter = new self($options);
return $formatter->formatList($lexer->list);
}
/**
* Computes the length of a group.
*
* A group is delimited by a pair of brackets.
*
* @param TokensList $list the list of tokens
*
* @return int
*/
public static function getGroupLength($list)
{
/**
* The number of opening brackets found.
* This counter starts at one because by the time this function called,
* the list already advanced one position and the opening bracket was
* already parsed.
*
* @var int
*/
$count = 1;
/**
* The length of this group.
*
* @var int
*/
$length = 0;
for ($idx = $list->idx; $idx < $list->count; ++$idx) {
// Counting the brackets.
if ($list->tokens[$idx]->type === Token::TYPE_OPERATOR) {
if ($list->tokens[$idx]->value === '(') {
++$count;
} elseif ($list->tokens[$idx]->value === ')') {
--$count;
if ($count === 0) {
break;
}
}
}
// Keeping track of this group's length.
$length += mb_strlen((string) $list->tokens[$idx]->value, 'UTF-8');
}
return $length;
}
/**
* Checks if a token is a statement or a clause inside a statement.
*
* @param Token $token the token to be checked
*
* @return int|bool
*/
public static function isClause($token)
{
if (
($token->type === Token::TYPE_KEYWORD && isset(Parser::$STATEMENT_PARSERS[$token->keyword]))
|| ($token->type === Token::TYPE_NONE && strtoupper($token->token) === 'DELIMITER')
) {
return 2;
}
if ($token->type === Token::TYPE_KEYWORD && isset(Parser::$KEYWORD_PARSERS[$token->keyword])) {
return 1;
}
return false;
}
}