sourcemod/tools/language_check/smc_parser.py

#!/usr/bin/python3

# BSD Zero Clause License
#
# Copyright (C) 2023 by nosoop
#
# Permission to use, copy, modify, and/or distribute this software for any purpose with or
# without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOFTWARE.

# https://gist.github.com/nosoop/8c6ccaec11b1d33340bec8dbc8096658
import collections
import enum
import itertools


class SMCOperation(enum.Enum):
    STRING = 1
    SUBSECTION_START = 2
    SUBSECTION_END = 3
    COMMENT = 4
    COMMENT_MULTILINE = 5
    KEYVALUE = 6


# https://stackoverflow.com/a/70762559
def takewhile_inclusive(predicate, it):
    for x in it:
        if predicate(x):
            yield x
        else:
            yield x
            break


def _is_whitespace(ch):
    return ch in (' ', '\t', '\n', '\r')


def _smc_stream_skip_whitespace(stream):
    # consumes whitespace and returns the first non-whitespace character if any, or None if EOS
    values = tuple(takewhile_inclusive(_is_whitespace, stream))
    if not values:
        return None
    *ws, last = values
    if not ws and not _is_whitespace(last):
        return last
    return last if ws and not _is_whitespace(last) else None


def _smc_stream_extract_multiline_comment(stream):
    while True:
        yield from itertools.takewhile(lambda ch: ch != '*', stream)
        ch = next(stream, None)
        if ch == '/':
            return
        yield '*'
        yield ch


_escape_mapping = str.maketrans({
    '"': '"',
    'n': '\n',
    'r': '\r',
    't': '\t',
    '\\': '\\',
})


def _smc_stream_extract_string(stream):
    for ch in stream:
        if ch == "\\":
            ch = next(stream).translate(_escape_mapping)
        elif ch == '"':
            return
        yield ch


def parse_smc_string(data):
    stream = iter(data)
    while True:
        ch = _smc_stream_skip_whitespace(stream)
        if ch is None:
            return
        elif ch == '"':
            # consume until the next quote, then determine if:
            # - the string marks the subsection name '{'
            # - we have another string to consume, making this a key / value pair
            key = ''.join(_smc_stream_extract_string(stream))

            ch = _smc_stream_skip_whitespace(stream)
            if ch == '{':
                yield SMCOperation.SUBSECTION_START, key
            elif ch == '"':
                value = ''.join(_smc_stream_extract_string(stream))
                yield SMCOperation.KEYVALUE, key, value
            else:
                raise ValueError(
                    f"Unexpected character {ch.encode('ascii', 'backslashreplace')} after end of string"
                )
        elif ch == '}':
            yield SMCOperation.SUBSECTION_END, None
        elif ch == '/':
            ch = next(stream)
            if ch == '/':
                # single line comment: consume until the end of the line
                value = ''.join(
                    itertools.takewhile(lambda ch: ch != '\n', stream))
                yield SMCOperation.COMMENT, value
            elif ch == '*':
                # multi line comment: consume until the sequence '*/' is reached
                value = ''.join(_smc_stream_extract_multiline_comment(stream))
                yield SMCOperation.COMMENT_MULTILINE, value
            else:
                raise ValueError(
                    f"Unexpected character {ch.encode('ascii', 'backslashreplace')} at start of comment"
                )
        else:
            raise ValueError(
                f"Unexpected character {ch.encode('ascii', 'backslashreplace')}"
            )


class MultiKeyDict(collections.defaultdict):
    # a dict that supports supports one-to-many mappings
    # init by passing keys pointing to a list of values
    def __init__(self, *args, **kwargs):
        super().__init__(list, *args, **kwargs)

    # yields a key, value pair for every array item associated with a key
    def items(self):
        yield from ((k, iv) for k, v in super().items() for iv in v)


def smc_string_to_dict(data):
    # returns a multidict instance
    root_node = MultiKeyDict()
    contexts = [root_node]
    for event, *info in parse_smc_string(data):
        if event == SMCOperation.SUBSECTION_START:
            key, *_ = info
            subkey = MultiKeyDict()
            contexts[-1][key].append(subkey)
            contexts.append(subkey)
        elif event == SMCOperation.SUBSECTION_END:
            contexts.pop()
        elif event == SMCOperation.KEYVALUE:
            key, value = info
            contexts[-1][key].append(value)
    return root_node


def main():
    SMC_STRING = """
	"thing"
	{
		// this is a comment node
		"key"	"value"

		"subthing"
		{
			// and another
			"subthing key"		"subthing value"
			"subthing key"		"duplicate key value"
		}
		"subthing"
		{
			"duplicate subthing" "yes"
		}

		/**
		 * this is a multiline comment node
		 */
		"another key"	"another value"
	}
	"""

    # sections = []
    # for event, *data in parse_smc_string(SMC_STRING):
    # 	print(event, data, tuple(sections))
    # 	if event == SMCOperation.SUBSECTION_START:
    # 		section, *_ = data
    # 		sections.append(section)
    # 	elif event == SMCOperation.SUBSECTION_END:
    # 		sections.pop()
    # assert(not sections)

    import json
    import pathlib
    # print(json.dumps(smc_string_to_dict(SMC_STRING), indent=4))
    for f in pathlib.Path('translations').rglob('*.txt'):
        print(f)
        print(json.dumps(smc_string_to_dict(f.read_text('utf8')), indent = 4))


if __name__ == "__main__":
    main()