Spaces:
Build error
Build error
#!/usr/bin/env python3 | |
import argparse | |
import json | |
import re | |
import sys | |
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'

# Grammar rule bodies for the JSON primitive types, keyed by JSON-schema `type` name.
# Every rule ends with `space` so optional trailing whitespace is consumed.
PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
    'string': r''' "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" space ''',
    'null': '"null" space',
}

# Any run of characters outside [a-zA-Z0-9-] is not allowed in a rule name.
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')

# Characters that must be backslash-escaped inside a quoted grammar literal,
# and the escape sequence each one maps to.
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
class SchemaConverter:
    """Translate a JSON schema into a collection of named grammar rules.

    Rules accumulate in ``self._rules`` (rule name -> rule body) while schemas
    are visited; ``format_grammar`` renders them as ``name ::= body`` lines.
    """

    def __init__(self, prop_order):
        """Create a converter.

        Args:
            prop_order: mapping of property name -> sort position. Object
                properties are emitted in this order; properties missing from
                the mapping come afterwards, sorted by name.
        """
        self._prop_order = prop_order
        # The shared whitespace rule is always present.
        self._rules = {'space': SPACE_RULE}

    def _format_literal(self, literal):
        """Return a quoted grammar literal matching the JSON encoding of *literal*.

        The value is serialized with ``json.dumps`` and then CR, LF and double
        quotes are backslash-escaped so the text can sit inside a quoted
        grammar string.
        """
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
        )
        return f'"{escaped}"'

    def _add_rule(self, name, rule):
        """Register *rule* under a name derived from *name* and return that name.

        Characters not allowed in rule names are replaced with ``-``. If the
        sanitized name is unused, or already maps to an identical rule, it is
        used as is; otherwise the first unused integer suffix (0, 1, ...) is
        appended to avoid clobbering the existing rule.
        """
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
            key = esc_name
        else:
            i = 0
            while f'{esc_name}{i}' in self._rules:
                i += 1
            key = f'{esc_name}{i}'
        self._rules[key] = rule
        return key

    def visit(self, schema, name):
        """Add the rules needed for *schema* and return the name of its rule.

        Handles ``oneOf``/``anyOf``, ``const``, ``enum``, objects with
        ``properties``, arrays with ``items`` and the primitive types in
        ``PRIMITIVE_RULES``.

        Args:
            schema: the (sub)schema as a dict.
            name: rule-name prefix for this schema; an empty string denotes
                the top-level schema, whose rule is named ``root``.

        Raises:
            AssertionError: if the schema matches none of the supported forms.
        """
        schema_type = schema.get('type')
        rule_name = name or 'root'

        if 'oneOf' in schema or 'anyOf' in schema:
            # Each alternative gets its own rule, named by its index.
            rule = ' | '.join(
                self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
                for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
            )
            return self._add_rule(rule_name, rule)

        elif 'const' in schema:
            return self._add_rule(rule_name, self._format_literal(schema['const']))

        elif 'enum' in schema:
            rule = ' | '.join(self._format_literal(v) for v in schema['enum'])
            return self._add_rule(rule_name, rule)

        elif schema_type == 'object' and 'properties' in schema:
            # TODO: `required` keyword
            # (until then every listed property is emitted, in sorted order)
            prop_order = self._prop_order
            prop_pairs = sorted(
                schema['properties'].items(),
                # sort by position in prop_order (if specified) then by key
                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
            )

            rule = '"{" space'
            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
                prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
                if i > 0:
                    rule += ' "," space'
                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
            rule += ' "}" space'

            return self._add_rule(rule_name, rule)

        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            return self._add_rule(rule_name, rule)

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # Primitive rules are named after their type so they are shared
            # between all uses, except when the primitive is the root itself.
            return self._add_rule(
                'root' if rule_name == 'root' else schema_type,
                PRIMITIVE_RULES[schema_type]
            )

    def format_grammar(self):
        """Return all collected rules as newline-separated ``name ::= rule`` lines."""
        return '\n'.join(f'{name} ::= {rule}' for name, rule in self._rules.items())
def main(args_in = None):
    """Command-line entry point: read a JSON schema and print its grammar.

    Args:
        args_in: argument list to parse instead of ``sys.argv[1:]``
            (``None`` means use the process arguments).
    """
    parser = argparse.ArgumentParser(
        description='''
            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
            given JSON schema. Only a subset of JSON schema features are supported; more may be
            added in the future.
        ''',
    )
    parser.add_argument(
        '--prop-order',
        default=[],
        type=lambda s: s.split(','),
        help='''
            comma-separated property names defining the order of precedence for object properties;
            properties not specified here are given lower precedence than those that are, and are
            sorted alphabetically
        '''
    )
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)

    if args.schema == '-':
        schema = json.load(sys.stdin)
    else:
        # Close the file deterministically; JSON text is UTF-8 regardless of
        # the platform's default encoding.
        with open(args.schema, encoding='utf-8') as schema_file:
            schema = json.load(schema_file)

    # Map each listed property name to its position for use as a sort key.
    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
    converter = SchemaConverter(prop_order)
    converter.visit(schema, '')
    print(converter.format_grammar())
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()