sourcemod/tools/language_check/smc_parser.py
Kyle Sanderson 48150e0c7a
Bring languages into the tree (#1625)
* translations: bring languages into tree

* Update translation phrases changed since 2021

* Update packaging script to include all translations

* Update languages.cfg

* Add Latin American Spanish translations

This is a copy of Spanish for now.

* Ignore "en" when looking for translation folders

English is the default and doesn't use a subfolder.

* Only add each translation folder once

Korean "ko" is in there twice.

* Compare language coverage to English

All phrases are compared to the English baseline files and any differences
are reported. The differences are also pushed to a GitHub Project for
an easier overview.

Thank you to @nosoop for sharing the Python SMC parser!

* Add link to README

---------

Co-authored-by: Peace-Maker <peace-maker@wcfan.de>
2023-03-29 16:23:05 +02:00

204 lines
5.9 KiB
Python

#!/usr/bin/python3
# BSD Zero Clause License
#
# Copyright (C) 2023 by nosoop
#
# Permission to use, copy, modify, and/or distribute this software for any purpose with or
# without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOFTWARE.
# https://gist.github.com/nosoop/8c6ccaec11b1d33340bec8dbc8096658
import collections
import enum
import itertools
@enum.unique
class SMCOperation(enum.Enum):
    """Kinds of events produced while parsing an SMC document."""

    # a quoted string on its own
    STRING = 1
    # a quoted name followed by '{'
    SUBSECTION_START = 2
    # a closing '}'
    SUBSECTION_END = 3
    # a '//' comment running to the end of the line
    COMMENT = 4
    # a '/* ... */' comment
    COMMENT_MULTILINE = 5
    # two consecutive quoted strings
    KEYVALUE = 6
# https://stackoverflow.com/a/70762559
def takewhile_inclusive(predicate, it):
    """Yield items from *it* up to and including the first one failing *predicate*.

    Unlike itertools.takewhile, the item that ends the run is yielded as well,
    so it is not lost when *it* is a one-shot iterator.
    """
    for item in it:
        # evaluate the predicate before handing the item out, then stop after
        # the first item that does not satisfy it
        keep_going = predicate(item)
        yield item
        if not keep_going:
            return
def _is_whitespace(ch):
return ch in (' ', '\t', '\n', '\r')
def _smc_stream_skip_whitespace(stream):
    """Advance *stream* past whitespace and return the first other character.

    The returned character is consumed from the stream. Returns None when the
    stream is exhausted before a non-whitespace character is found.
    """
    return next((ch for ch in stream if not _is_whitespace(ch)), None)
def _smc_stream_extract_multiline_comment(stream):
while True:
yield from itertools.takewhile(lambda ch: ch != '*', stream)
ch = next(stream, None)
if ch == '/':
return
yield '*'
yield ch
_escape_mapping = str.maketrans({
'"': '"',
'n': '\n',
'r': '\r',
't': '\t',
'\\': '\\',
})
def _smc_stream_extract_string(stream):
for ch in stream:
if ch == "\\":
ch = next(stream).translate(_escape_mapping)
elif ch == '"':
return
yield ch
def _smc_unexpected(ch, context=''):
    """Build the ValueError for an unexpected character, or end of input if *ch* is None."""
    if ch is None:
        return ValueError(f"Unexpected end of input{context}")
    return ValueError(
        f"Unexpected character {ch.encode('ascii', 'backslashreplace')}{context}"
    )


def parse_smc_string(data):
    """Parse SMC text and yield one event tuple per syntactic element.

    Yielded tuples, in document order:
      - (SMCOperation.SUBSECTION_START, name) for a quoted name followed by '{'
      - (SMCOperation.KEYVALUE, key, value) for two consecutive quoted strings
      - (SMCOperation.SUBSECTION_END, None) for '}'
      - (SMCOperation.COMMENT, text) for a '//' comment (text up to the newline)
      - (SMCOperation.COMMENT_MULTILINE, text) for a '/* ... */' comment

    Raises:
        ValueError: on a character that cannot start or continue an element,
            or when the input ends in the middle of one (after a key, or
            after a single '/').
    """
    stream = iter(data)
    while True:
        ch = _smc_stream_skip_whitespace(stream)
        if ch is None:
            return
        elif ch == '"':
            # consume until the next quote, then determine if:
            # - the string marks the subsection name '{'
            # - we have another string to consume, making this a key / value pair
            key = ''.join(_smc_stream_extract_string(stream))
            ch = _smc_stream_skip_whitespace(stream)
            if ch == '{':
                yield SMCOperation.SUBSECTION_START, key
            elif ch == '"':
                value = ''.join(_smc_stream_extract_string(stream))
                yield SMCOperation.KEYVALUE, key, value
            else:
                # ch is None here when the input ends right after a key
                raise _smc_unexpected(ch, " after end of string")
        elif ch == '}':
            yield SMCOperation.SUBSECTION_END, None
        elif ch == '/':
            # default of None: a bare next() at end of input would surface as
            # RuntimeError from inside this generator
            ch = next(stream, None)
            if ch == '/':
                # single line comment: consume until the end of the line
                value = ''.join(
                    itertools.takewhile(lambda c: c != '\n', stream))
                yield SMCOperation.COMMENT, value
            elif ch == '*':
                # multi line comment: consume until the sequence '*/' is reached
                value = ''.join(_smc_stream_extract_multiline_comment(stream))
                yield SMCOperation.COMMENT_MULTILINE, value
            else:
                raise _smc_unexpected(ch, " at start of comment")
        else:
            raise _smc_unexpected(ch)
class MultiKeyDict(collections.defaultdict):
    """A dict that supports one-to-many mappings.

    Every key maps to a list of values; missing keys start out as an empty
    list. Initialise it by passing keys that point to lists of values.
    """

    def __init__(self, *args, **kwargs):
        # list is always the default factory; remaining arguments are
        # forwarded to the defaultdict constructor untouched
        super().__init__(list, *args, **kwargs)

    def items(self):
        """Yield one (key, value) pair for every list entry stored under a key."""
        for key, values in super().items():
            for value in values:
                yield key, value
def smc_string_to_dict(data):
    """Parse SMC text into nested MultiKeyDict instances.

    Each subsection becomes a MultiKeyDict appended under its name in the
    enclosing section, and each key / value pair is appended under its key,
    so repeated names and keys are all kept. Comments are ignored. Sections
    still open when the input ends are kept as parsed.

    Returns:
        The root MultiKeyDict.

    Raises:
        ValueError: if a '}' has no matching '{', or if the input is
            rejected by parse_smc_string.
    """
    root_node = MultiKeyDict()
    # stack of the sections currently open; the root is always at the bottom
    contexts = [root_node]
    for event, *info in parse_smc_string(data):
        if event == SMCOperation.SUBSECTION_START:
            key, *_ = info
            subkey = MultiKeyDict()
            contexts[-1][key].append(subkey)
            contexts.append(subkey)
        elif event == SMCOperation.SUBSECTION_END:
            # Never pop the root: doing so would make a later event fail with
            # an IndexError, or let a stray '}' pass unnoticed.
            if len(contexts) == 1:
                raise ValueError("Unexpected '}' without a matching '{'")
            contexts.pop()
        elif event == SMCOperation.KEYVALUE:
            key, value = info
            contexts[-1][key].append(value)
    return root_node
def main():
    """Print every '*.txt' file found under 'translations' as indented JSON.

    The 'translations' directory is looked up relative to the current working
    directory and searched recursively.
    """
    import json
    import pathlib

    # Sample document; only referenced by the commented-out debugging
    # snippets below.
    SMC_STRING = """
"thing"
{
// this is a comment node
"key" "value"
"subthing"
{
// and another
"subthing key" "subthing value"
"subthing key" "duplicate key value"
}
"subthing"
{
"duplicate subthing" "yes"
}
/**
* this is a multiline comment node
*/
"another key" "another value"
}
"""

    # Debugging aid: dump the raw event stream together with the section stack.
    # sections = []
    # for event, *data in parse_smc_string(SMC_STRING):
    #     print(event, data, tuple(sections))
    #     if event == SMCOperation.SUBSECTION_START:
    #         section, *_ = data
    #         sections.append(section)
    #     elif event == SMCOperation.SUBSECTION_END:
    #         sections.pop()
    # assert(not sections)

    # Debugging aid: dump the sample document as JSON.
    # print(json.dumps(smc_string_to_dict(SMC_STRING), indent=4))

    for translation_file in pathlib.Path('translations').rglob('*.txt'):
        print(translation_file)
        parsed = smc_string_to_dict(translation_file.read_text('utf8'))
        print(json.dumps(parsed, indent=4))


if __name__ == "__main__":
    main()