diff --git a/tools/standardTextToCode/DummyReaderAndBuildFiles/CMakeLists.txt b/tools/standardTextToCode/DummyReaderAndBuildFiles/CMakeLists.txt new file mode 100644 index 000000000..9e7cc401d --- /dev/null +++ b/tools/standardTextToCode/DummyReaderAndBuildFiles/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.10) +project(ParsingTestApp) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Collect all .cpp files in the current directory +file(GLOB SOURCES "*.cpp") + +# Change to create a library instead of executable +# Remove the add_executable line and replace with: +add_library(ParsingTestApp ${SOURCES}) diff --git a/tools/standardTextToCode/DummyReaderAndBuildFiles/SubByteReaderDummy.h b/tools/standardTextToCode/DummyReaderAndBuildFiles/SubByteReaderDummy.h new file mode 100644 index 000000000..199066dd2 --- /dev/null +++ b/tools/standardTextToCode/DummyReaderAndBuildFiles/SubByteReaderDummy.h @@ -0,0 +1,24 @@ + +#pragma once + +#include <string> + +namespace parser +{ + +class SubByteReaderDummy +{ +public: + SubByteReaderDummy() = default; + + bool byte_aligned() const { return false; } + + bool readFlag(const std::string &name) { return false; } + int readBits(const std::string &name, const int numBits) { return 0; } + int readBitsSigned(const std::string &name, const int numBits) { return 0; } + unsigned readUEV(const std::string &name) { return 0; } + int readSEV(const std::string &name) { return 0; } + std::string readString(const std::string &name) { return ""; } +}; + +} // namespace parser diff --git a/tools/standardTextToCode/DummyReaderAndBuildFiles/Units.h b/tools/standardTextToCode/DummyReaderAndBuildFiles/Units.h new file mode 100644 index 000000000..d5aadca53 --- /dev/null +++ b/tools/standardTextToCode/DummyReaderAndBuildFiles/Units.h @@ -0,0 +1,22 @@ +#pragma once + +namespace parser +{ + +class NalUnit +{ +public: + NalUnit() = default; + + // This could contain file position info, nal unit type info, etc... 
+}; + +class SEI +{ +public: + SEI() = default; + + // This could contain a type as an enum etc... +}; + +} // namespace parser diff --git a/tools/standardTextToCode/README.md b/tools/standardTextToCode/README.md index 6286d6e0a..dcedf3e0d 100644 --- a/tools/standardTextToCode/README.md +++ b/tools/standardTextToCode/README.md @@ -11,3 +11,7 @@ Read the `docx` document and parse all the tables (e.g `VPS`, `SPS`, ...). Put t ### writeTablesC++.py Take the pickled file and write CPP classes from it. This can be used as the starting point to create the actual parsing code. Or it can be used to add new parsing code which was added in a later version of a standard. + +### Prerequisites + +Install python-docx (https://github.com/python-openxml/python-docx). \ No newline at end of file diff --git a/tools/standardTextToCode/codingType.py b/tools/standardTextToCode/codingType.py index a49087c13..5eb602c9e 100644 --- a/tools/standardTextToCode/codingType.py +++ b/tools/standardTextToCode/codingType.py @@ -6,7 +6,11 @@ class Coding(Enum): UNSIGNED_FIXED = auto() # u(x) - unsigned int with a fixed number of bits UNSIGNED_VARIABLE = auto() # u(v) - unsigned int with a variable number of bits UNSIGNED_EXP = auto() # ue(v) - unsigned int wit exp golomb coding + SIGNED_FIXED = auto() # i(x) - signed int with a fixed number of bits using two's complement SIGNED_EXP = auto() # se(v) - signed int with exp golomb coding + BYTE = auto() # b(8) - a byte (8 bits) + STRING = auto() # st(v) - null-terminated string in ISO/IEC 10646 UCS + UNKNOWN = auto() def isCodingType(descriptor : str): try: @@ -20,30 +24,48 @@ def __init__(self, descriptor : str): self.length = 0 self.codingType = None self.parseCodingType(descriptor) + def parseCodingType(self, descriptor : str): - if (descriptor.startswith("f(")): + if descriptor.startswith("f("): self.codingType = Coding.FIXED_CODE self.length = int(descriptor[2:-1]) - elif (descriptor.startswith("u(v)")): + elif descriptor.startswith("u(v)"): 
self.codingType = Coding.UNSIGNED_VARIABLE - elif (descriptor.startswith("u(")): + elif descriptor.startswith("u("): self.codingType = Coding.UNSIGNED_FIXED self.length = int(descriptor[2:-1]) - elif (descriptor.startswith("ue(v)")): + elif descriptor.startswith("ue(v)"): self.codingType = Coding.UNSIGNED_EXP - elif (descriptor.startswith("se(v)")): + elif descriptor.startswith("se(v)"): self.codingType = Coding.SIGNED_EXP + elif descriptor.startswith("b(8)"): + # Used in e.g. VSEI standard + self.codingType = Coding.BYTE + elif descriptor.startswith("i("): + self.codingType = Coding.SIGNED_FIXED + self.length = int(descriptor[2:-1]) + elif descriptor.startswith("st(v)"): + self.codingType = Coding.STRING else: - raise SyntaxError("Unknown descriptor type " + descriptor) + self.codingType = Coding.UNKNOWN + def __str__(self): - if (self.codingType == Coding.FIXED_CODE): + if self.codingType == Coding.FIXED_CODE: return f"f({self.length})" - if (self.codingType == Coding.UNSIGNED_VARIABLE): + if self.codingType == Coding.UNSIGNED_VARIABLE: return "u(v)" - if (self.codingType == Coding.UNSIGNED_FIXED): + if self.codingType == Coding.UNSIGNED_FIXED: return f"u({self.length})" - if (self.codingType == Coding.UNSIGNED_EXP): + if self.codingType == Coding.UNSIGNED_EXP: return "ue(v)" - if (self.codingType == Coding.SIGNED_EXP): + if self.codingType == Coding.SIGNED_EXP: return "se(v)" + if self.codingType == Coding.BYTE: + return "b(8)" + if self.codingType == Coding.SIGNED_FIXED: + return f"i({self.length})" + if self.codingType == Coding.STRING: + return "st(v)" + if self.codingType == Coding.UNKNOWN: + return "unknown" return "Err" diff --git a/tools/standardTextToCode/parseTables.py b/tools/standardTextToCode/parseTables.py index 384da360b..ba52d296b 100644 --- a/tools/standardTextToCode/parseTables.py +++ b/tools/standardTextToCode/parseTables.py @@ -1,26 +1,35 @@ from codingType import Coding, CodingType, isCodingType import re +from enum import Enum, unique, auto 
-def isVariableName(text : str): + +def isVariableName(text: str): if ("[" in text and "]" in text): text = text.split("[")[0] # Array indices are ok return re.fullmatch("[a-z][a-z0-9]*(_[a-z0-9]+)+", text) -def isFunctionCall(text : str): + + +def isFunctionCall(text: str): if (not "(" in text or not ")" in text): return False return isVariableName(text.split("(")[0].strip()) -def removeComments(text : str): + + +def removeComments(text: str): commentStart = text.find("/*") while (commentStart != -1): commentEnd = text.find("*/") if (commentEnd == -1): return text if (commentEnd <= commentStart): - raise SyntaxError("Error removing comment. End before start. Line: " + text) + raise SyntaxError( + "Error removing comment. End before start. Line: " + text) text = text[0:commentStart] + text[commentEnd+2:] commentStart = text.find("/*") return text.strip() -def cleanCondition(text : str): + + +def cleanCondition(text: str): text = text.strip() text = text.replace("=\xa0=", "==") text = text.replace("|\xa0|", "||") @@ -33,9 +42,12 @@ def cleanCondition(text : str): text = text.replace("\u2212", "-") text = text.replace('−', '-') if (text.find("\xa0") != -1): - raise SyntaxError("There still is a char to replace in the condition. This must be cleaned up first.") + raise SyntaxError( + "There still is a char to replace in the condition. This must be cleaned up first.") return text -def cleanArgument(text : str): + + +def cleanArgument(text: str): text = text.strip() text = text.replace("[\xa0", "[") text = text.replace("\xa0]", "]") @@ -43,9 +55,12 @@ def cleanArgument(text : str): text = text.replace("\u2212", "-") text = text.replace('−', '-') if (text.find("\xa0") != -1): - raise SyntaxError("There still is a char to replace in the argument. This must be cleaned up first.") + raise SyntaxError( + "There still is a char to replace in the argument. 
This must be cleaned up first.") return text -def cleanComment(text : str): + + +def cleanComment(text: str): text = text.strip() text = text.replace("=\xa0=", "==") text = text.replace("[\xa0", "[") @@ -58,25 +73,33 @@ def cleanComment(text : str): text = text.replace("\u2212", "-") text = text.replace('−', '-') if (text.find("\xa0") != -1): - raise SyntaxError("There still is a char to replace in the comment. This must be cleaned up first.") + raise SyntaxError( + "There still is a char to replace in the comment. This must be cleaned up first.") return text -def cleanConditionPart(text : str): + + +def cleanConditionPart(text: str): text = text.strip() text = text.replace("\xa0−\xa0", " - ") text = text.replace('−', '-') if (text.find("\xa0") != -1): - raise SyntaxError("There still is a char to replace in the condition. This must be cleaned up first.") + raise SyntaxError( + "There still is a char to replace in the condition. This must be cleaned up first.") return text -def cleanIncrement(text : str): + + +def cleanIncrement(text: str): text = text.strip() text = text.replace("-\xa0-", "--") text = text.replace("−\xa0−", "--") text = text.replace('−', '-') if (text.find("\xa0") != -1): - raise SyntaxError("There still is a char to replace in the increment. This must be cleaned up first.") + raise SyntaxError( + "There still is a char to replace in the increment. 
This must be cleaned up first.") return text -def getEntryType(text : str): + +def getEntryType(text: str): text = removeComments(text) if isVariableName(text): return "Variable" @@ -91,10 +114,19 @@ def getEntryType(text : str): if text.startswith("do"): return "do" + +def tryFindVariableDescription(name, variableDescriptions): + for description in variableDescriptions: + if name in description.names: + return description + return None + + class ParsingItem: def __init__(self, parent): self.parent = parent - + + class Variable(ParsingItem): def __init__(self, parent): super().__init__(parent) @@ -102,14 +134,16 @@ def __init__(self, parent): self.arrayIndex = None self.coding = None self.description = None - def fromText(self, name : str, descriptor : str, variableDescriptions : dict): + + def fromText(self, name: str, descriptor: str, variableDescriptions: dict): if ("[" in name and "]" in name): self.arrayIndex = [] openBracket = name.find("[") self.name = name[0:openBracket] while (True): closeBracket = name.find("]") - newIndex = cleanArgument(name[openBracket+1:closeBracket].strip()) + newIndex = cleanArgument( + name[openBracket+1:closeBracket].strip()) self.arrayIndex.append(newIndex) name = name[closeBracket+1:] openBracket = name.find("[") @@ -117,9 +151,10 @@ def fromText(self, name : str, descriptor : str, variableDescriptions : dict): break else: self.name = name - if (self.name in variableDescriptions): - self.description = variableDescriptions[self.name] + self.description = tryFindVariableDescription( + self.name, variableDescriptions) self.coding = CodingType(descriptor) + def __str__(self): s = "" for _ in range(self.parent.depth): @@ -129,24 +164,29 @@ def __str__(self): s += str(self.arrayIndex) return f"{s} --> {self.coding}" + class CommentEntry(ParsingItem): def __init__(self, parent): super().__init__(parent) self.text = None - def fromText(self, text : str): + + def fromText(self, text: str): self.text = cleanComment(text) + def 
__str__(self): s = "" for _ in range(self.parent.depth): s += " " return f"{s}//{self.text}" + class FunctionCall(ParsingItem): def __init__(self, parent): super().__init__(parent) self.functionName = None self.arguments = None - def fromText(self, name : str): + + def fromText(self, name: str): self.functionName = name.split("(")[0] self.arguments = [] for argument in (name.split("(")[1].split(")")[0].split(",")): @@ -154,141 +194,167 @@ def fromText(self, name : str): if (len(c) > 0): self.arguments.append(cleanArgument(argument)) debugStop = 234 + def __str__(self): spaces = "" for _ in range(self.parent.depth): spaces += " " return f"{spaces}{self.functionName}({self.arguments})" + class Container(ParsingItem): def __init__(self, parent): super().__init__(parent) self.children = [] - self.depth = 0 - self.depth = 0 - def parseChildren(self, table, tableIndex, variableDescriptions): - # Get the initial depth - t0_full = table.cell(0, tableIndex).text - while (t0_full.lstrip("\t").startswith("/*")): - # Ignore comments - tableIndex += 2 - t0_full = table.cell(0, tableIndex).text - self.depth = len(t0_full) - len(t0_full.lstrip("\t")) + self.depth = None + def parseChildren(self, table, rowIndex, currentDepth, variableDescriptions): try: while (True): - t0_full = table.cell(0, tableIndex).text - newDepth = len(t0_full) - len(t0_full.lstrip("\t")) - startsWithComment = t0_full.lstrip("\t").startswith("/*") - if (newDepth < self.depth and not startsWithComment): - # End of container - return tableIndex - if (newDepth > self.depth): - raise SyntaxError(f"The depth of the line is higher then the container depth. Line: {t0_full}") - t0 = t0_full.strip() - t1 = table.cell(0, tableIndex+1).text.strip() - entryType = getEntryType(t0) - - lastEntry = False - try: - t2 = table.cell(0, tableIndex+2).text.strip() - if (t2 == t1): - # Skip identical entries. This may be the aforementioned glitch. 
- tableIndex += 1 - except IndexError: - # No more data - lastEntry = True - tableIndex += 2 + rawSymbol = table.cell(rowIndex, 0).text + + startsWithComment = rawSymbol.lstrip("\t").startswith("/*") + newDepth = len(rawSymbol) - len(rawSymbol.lstrip("\t")) + if currentDepth == None: + currentDepth = newDepth + if newDepth < currentDepth and not startsWithComment: + return rowIndex + elif newDepth > currentDepth: + raise SyntaxError( + f"The depth of the line is higher then the container depth. This should only happen when entering a container (e.g. if, else, while). Symbol: {rawSymbol.strip()}") + + symbol = table.cell(rowIndex, 0).text.strip() + coding = table.cell(rowIndex, 1).text.strip() + + entryType = getEntryType(symbol) + + # print(f"Parsing entry: {symbol}") if (entryType == "Variable"): v = Variable(self) - v.fromText(t0, t1, variableDescriptions) - #print(f"{v}") + v.fromText(symbol, coding, variableDescriptions) + # print(f"{v}") self.children.append(v) + rowIndex += 1 elif (entryType == "FunctionCall"): f = FunctionCall(self) - f.fromText(t0) - #print(f"{f}") + f.fromText(symbol) + # print(f"{f}") self.children.append(f) + rowIndex += 1 elif (entryType == "for"): f = ContainerFor(self) - f.fromText(t0) - #print(f"{f}") + f.fromText(symbol) + # print(f"{f}") self.children.append(f) - tableIndex = f.parseChildren(table, tableIndex, variableDescriptions) + rowIndex = f.parseChildren( + table, rowIndex + 1, currentDepth + 1, variableDescriptions) elif (entryType == "if"): i = ContainerIf(self) - i.fromText(t0) - #print(f"{i}") + i.fromText(symbol) + # print(f"{i}") self.children.append(i) - tableIndex = i.parseChildren(table, tableIndex, variableDescriptions) + rowIndex = i.parseChildren( + table, rowIndex + 1, currentDepth + 1, variableDescriptions) elif (entryType == "while"): w = ContainerWhile(self) - w.fromText(t0) - #print(f"{w}") + w.fromText(symbol) + # print(f"{w}") self.children.append(w) - tableIndex = w.parseChildren(table, tableIndex, 
variableDescriptions) + rowIndex = w.parseChildren( + table, rowIndex + 1, currentDepth + 1, variableDescriptions) elif (entryType == "do"): d = ContainerDo(self) - d.fromText(t0) - #print(d.getDoText()) - tableIndex = d.parseChildren(table, tableIndex, variableDescriptions) - tableIndex = d.parseClosingWhile(table, tableIndex) - #print(f"{d}") + d.fromText(symbol) + # print(d.getDoText()) + rowIndex = d.parseChildren( + table, rowIndex + 1, currentDepth + 1, variableDescriptions) + rowIndex = d.parseClosingWhile(table, rowIndex) + # print(f"{d}") self.children.append(d) elif (entryType == "comment"): c = CommentEntry(self) - c.fromText(t0) - #print(f"{c}") + c.fromText(symbol) + # print(f"{c}") self.children.append(c) + rowIndex += 1 elif (entryType != None): - raise SyntaxError(f"Entry type is unknown. Line: {t0_full}") + raise SyntaxError( + f"Entry type is unknown. Line: {symbol}") + elif symbol == "}": + rowIndex += 1 else: - if (t0.strip() != "}"): - c = CommentEntry(self) - c.fromText(t0) - #print(f"{c}") - self.children.append(c) - - if (lastEntry): - return tableIndex + c = CommentEntry(self) + c.fromText(symbol) + # print(f"{c}") + self.children.append(c) + rowIndex += 1 + + if (rowIndex + 1 >= len(table.rows)): + return rowIndex except Exception as ex: print(f"Error parsing {self}: {ex}") - return tableIndex + if hasattr(self, "name"): + print(f"In table {self.name}") + return rowIndex + + +@unique +class TableType(Enum): + NAL_UNIT = auto() # A full NAL unit + # An SEI message. This knows its payload size when reading. + SEI_MESSAGE = auto() + # An element (a function) that is part of an SEI or a NAL unit. 
+ ELEMENT = auto() + class ContainerTable(Container): def __init__(self): super().__init__(None) + self.name = "" + self.type = None + self.arguments = None + def parseContainer(self, table, variableDescriptions): self.parseHeader(table.cell(0, 0).text) - t1 = table.cell(0, 1).text.strip() - t2 = table.cell(0, 2).text.strip() - if (t2 == t1): - self.parseChildren(table, 3, variableDescriptions) + if len(self.arguments) == 0: + self.type = TableType.NAL_UNIT + elif len(self.arguments) == 1 and self.arguments[0] == "payloadSize": + self.type = TableType.SEI_MESSAGE else: - self.parseChildren(table, 2, variableDescriptions) + self.type = TableType.ELEMENT + t1 = table.cell(0, 1).text.strip() + if not "descriptor" in table.cell(0, 1).text.strip().lower(): + print( + f"Warning: Table header column 2 does not contain 'descriptor' heading in table {self.name}") + self.parseChildren(table, 1, None, variableDescriptions) + def parseHeader(self, header): header = header.replace(u'\xa0', u' ') bracketOpen = header.find("(") bracketClose = header.find(")") + if bracketOpen == -1 or bracketClose == -1: + raise SyntaxError( + f"Table header does not contain brackets: {header}") self.name = header[:bracketOpen] self.arguments = [] - for a in header[bracketOpen+1 : bracketClose].split(","): + for a in header[bracketOpen+1: bracketClose].split(","): self.arguments.append(a.strip()) - print(f"Table: {self.name}") - + + class ContainerIf(Container): def __init__(self, parent): super().__init__(parent) self.condition = None self.isElseIf = False self.isElse = False - def fromText(self, text : str): + + def fromText(self, text: str): if (not text.startswith("if") and not text.startswith("else if") and not text.startswith("} else") and not text.startswith("else")): raise SyntaxError("If container does not start with if or else if") - if (text.startswith("else if")): + elif (text.startswith("else if") or text.startswith("} else if")): self.isElseIf = True - if (text.startswith("} else") 
or text.startswith("else")): + elif (text.startswith("} else") or text.startswith("else")): self.isElse = True return start = text.find("(") @@ -296,6 +362,7 @@ def fromText(self, text : str): if (start == -1 or end == -1): raise SyntaxError("If condition does not contain brackets") self.condition = cleanCondition(text[start+1:end]) + def __str__(self): spaces = "" for _ in range(self.parent.depth): @@ -306,11 +373,13 @@ def __str__(self): return f"{spaces}else if({self.condition})" return f"{spaces}if({self.condition})" + class ContainerWhile(Container): def __init__(self, parent): super().__init__(parent) self.condition = None - def fromText(self, text : str): + + def fromText(self, text: str): if (not text.startswith("while")): raise SyntaxError("While container does not start with while") start = text.find("(") @@ -318,51 +387,47 @@ def fromText(self, text : str): if (start == -1 or end == -1): raise SyntaxError("While loop does not contain brackets") self.condition = cleanCondition(text[start+1:end]) + def __str__(self): spaces = "" for _ in range(self.parent.depth): spaces += " " return f"{spaces}while({self.condition})" + class ContainerDo(Container): def __init__(self, parent): super().__init__(parent) self.condition = None - def fromText(self, text : str): + + def fromText(self, text: str): if (not text.startswith("do")): raise SyntaxError("Do container does not start with do") - def parseClosingWhile(self, table, tableIndex : int): - t0_full = table.cell(0, tableIndex).text - text = t0_full.strip() - if (not text.startswith("} while")): + + def parseClosingWhile(self, table, rowIndex: int): + symbol = table.cell(rowIndex, 0).text.strip() + if (not symbol.startswith("} while")): raise SyntaxError("do does not end with while") - start = text.find("(") - end = text.rfind(")") + start = symbol.find("(") + end = symbol.rfind(")") if (start == -1 or end == -1): raise SyntaxError("Do ... 
while loop does not contain brackets") - self.condition = cleanCondition(text[start+1:end]) - t1 = table.cell(0, tableIndex+1).text.strip() - try: - t2 = table.cell(0, tableIndex+2).text.strip() - if (t2 == t1): - # Skip identical entries. This may be the aforementioned glitch. - tableIndex += 1 - except IndexError: - # No more data - pass - tableIndex += 2 - return tableIndex + self.condition = cleanCondition(symbol[start+1:end]) + return rowIndex + 1 + def getDoText(self): spaces = "" for _ in range(self.parent.depth): spaces += " " return f"{spaces}do" + def __str__(self): spaces = "" for _ in range(self.parent.depth): spaces += " " return f"{spaces}while ({self.condition})" + class ContainerFor(Container): def __init__(self, parent): super().__init__(parent) @@ -370,39 +435,57 @@ def __init__(self, parent): self.initialValue = None self.breakCondition = None self.increment = None - def fromText(self, text : str): + + def fromText(self, text: str): split = text.split(";") if (not split[0].startswith("for")): - raise SyntaxError("For container does not start with for") - + raise SyntaxError(f"For container does not start with for - {text}") + if (len(split) != 3): + raise SyntaxError(f"For container does not have exactly three parts - {text}") + firstPart = split[0][split[0].find("(") + 1:] self.variableName = cleanConditionPart(firstPart.split("=")[0]) self.initialValue = cleanConditionPart(firstPart.split("=")[1]) self.breakCondition = cleanCondition(split[1]) self.increment = cleanIncrement(split[2][0:split[2].find(")")]) + def __str__(self): spaces = "" for _ in range(self.parent.depth): spaces += " " return f"{spaces}for({self.variableName} = {self.initialValue}; {self.breakCondition}; {self.increment})" + def parseDocumentTables(document, variableDescriptions): parsedTables = [] - firstLastEntry = ["nal_unit_header", "slice_data"] + startEntries = ["vui_parameters", "filler_payload"] + endEntries = ["vui_parameters", "reserved_message"] skipEntries = 
["sei_rbsp"] - - firstEntryFound = False + + parsingEnabled = False for table in document.tables: - entryName = table.cell(0, 0).text.split("(")[0] - if (not firstEntryFound and entryName == firstLastEntry[0]): - firstEntryFound = True - if (firstEntryFound and entryName == firstLastEntry[1]): - break - if (entryName in skipEntries): + if len(table.rows) == 0 or len(table.columns) != 2: + continue + firstCell = table.cell(0, 0) + if firstCell.text == "Value" and firstCell.paragraphs[0].style.name == "Table_head": continue - if firstEntryFound: - tableItem = ContainerTable() - tableItem.parseContainer(table, variableDescriptions) - parsedTables.append(tableItem) - return parsedTables \ No newline at end of file + entryName = firstCell.text.split("(")[0] + if not parsingEnabled and entryName in startEntries: + parsingEnabled = True + if entryName in skipEntries or entryName.strip() == "" or entryName.startswith("Table"): + continue + if parsingEnabled: + try: + tableItem = ContainerTable() + tableItem.parseContainer(table, variableDescriptions) + if tableItem.name == "": + print("Warning: Table with empty name encountered. 
Ignoring Table.") + else: + print(f"Parsed Table: {tableItem.name}") + parsedTables.append(tableItem) + except Exception as ex: + print(f"Error parsing table {firstCell.text} - {ex}") + if (parsingEnabled and entryName in endEntries): + parsingEnabled = False + return parsedTables diff --git a/tools/standardTextToCode/parseVariableDescriptions.py b/tools/standardTextToCode/parseVariableDescriptions.py index 65d854ab4..02a856740 100644 --- a/tools/standardTextToCode/parseVariableDescriptions.py +++ b/tools/standardTextToCode/parseVariableDescriptions.py @@ -33,28 +33,34 @@ def __str__(self): class VariableDescription(): - def __init__(self, name): - self.name = name + def __init__(self, names): + self.names = names self.description = "" self.shallBe = None + self.variableParsingLength = None def __str__(self): if (self.shallBe): - return f"{self.name} -- {self.shallBe}" + return f"{self.names} -- {self.shallBe}" else: - return f"{self.name}" + return f"{self.names}" def finishReading(self): + if self.names == None or len(self.names) == 0: + print( + f"Warning: No names for description {self.description}. 
Ignoring.") + return self.cleanDescriptionText() self.lookForShallBe() + self.lookForVariableParsingLength() def cleanDescriptionText(self): self.description = self.description.replace(u'\xa0', u' ') def lookForShallBe(self): searchStrings = [] - searchStrings.append(self.name + " shall be ") - searchStrings.append("the value of " + self.name + " shall be ") + searchStrings.append(self.names[0] + " shall be ") + searchStrings.append("the value of " + self.names[0] + " shall be ") for searchString in searchStrings: posShallBe = self.description.lower().find(searchString) if (posShallBe != -1): @@ -92,7 +98,7 @@ def parseRestriction(self, restrictionText): posMaxStart = posMinEnd + len(" to ") posMaxEnd = restrictionText.find(", inclusive", posMaxStart) if (posMinEnd == -1 or posMaxStart == -1 or posMaxEnd == -1): - print("Error parsing range") + print(f"Error parsing range: {restrictionText}") return self.shallBe = VariableRestrictionRage( restrictionText[conditionStart: posMinEnd], restrictionText[posMaxStart: posMaxEnd]) @@ -106,59 +112,84 @@ def parseRestriction(self, restrictionText): self.shallBe = VariableRestrictionEqualTo( restrictionText[conditionStart:]) else: - assert(False) + assert (False) if (conditionIndex in [2, 4]): print("Warning: Using then in a comparison.") + def lookForVariableParsingLength(self): + descriptionLower = self.description.lower() + lengthFormulations = [("the number of bits used for ", "syntax element is"), + ("the length of the ", "syntax element is"), ("the length of ", " is ")] + + for partA, partB in lengthFormulations: + posSentenceStart = descriptionLower.find(partA) + if posSentenceStart == -1: + continue + posNextPeriod = descriptionLower.find(".", posSentenceStart) + posTextBeforeValue = descriptionLower.find(partB, posSentenceStart) + if posNextPeriod == -1 or posTextBeforeValue == -1 or posNextPeriod < posTextBeforeValue: + continue + + self.variableParsingLength = self.description[posTextBeforeValue + + 
len(partB):posNextPeriod].strip() + if self.variableParsingLength.endswith(" bits"): + self.variableParsingLength = self.variableParsingLength[:-len(" bits")] + + return -def parseDocForVariableDescriptions(document): - firstLastEntry = ["NAL unit header semantics", "Slice data semantics"] - firstEntryFound = False - variableDescriptions = dict() + +def extractVariableNamesFromRuns(runs): + # It can happen that the variable description is split over multiple + # runs. It will all be bold without spaces and can just be concatenated. + # Please don't ask me why. + names = [] + currentName = "" + for run in runs: + text = run.text.strip() + if run.font.bold and not " " in text: + currentName += text + elif not run.font.bold and (text == "and" or text == ","): + if currentName != "": + names.append(currentName) + currentName = "" + else: + names.append(currentName) + break + return names + + +def parseDocForVariableDescriptions(document, headingsToParse): + parsingEnabled = False + variableDescriptions = [] currentDescription = None for paragraph in document.paragraphs: - if (firstLastEntry[0] in paragraph.text): - firstEntryFound = True - if (firstLastEntry[1] in paragraph.text): - break - if (firstEntryFound): - if (len(paragraph.runs) > 1): - runText = paragraph.runs[0].text - isBold = paragraph.runs[0].font.bold - if (isBold): - if (currentDescription != None): - currentDescription.finishReading() - variableDescriptions[currentDescription.name] = currentDescription - # It can happen that the variable description is split over multiple - # runs. It will all be bold without spaces and can just be concatenated. - # Please don't ask me why. 
- runIndex = 1 - for i in range(1, len(paragraph.runs)): - isBold = paragraph.runs[i].font.bold - text = paragraph.runs[i].text - containsSpaceAtEnd = (text[-1] == " ") - if (containsSpaceAtEnd): - text = text[:-1] - containesSpaces = (text.find(" ") != -1) - if (isBold and not containesSpaces): - runText += text - else: - break - if (containsSpaceAtEnd): - break - runIndex += 1 - currentDescription = VariableDescription(runText) - # Get all the text after the variable name - for i in range(runIndex, len(paragraph.runs)): - runText = paragraph.runs[i].text - currentDescription.description += runText - continue - if (currentDescription != None): - currentDescription.description += paragraph.text + if paragraph.style.name == "Heading 1": + parsingEnabled = (paragraph.text in headingsToParse) + if not parsingEnabled: + continue + if paragraph.style.name.startswith("Heading"): + continue + + if (len(paragraph.runs) > 1): + isBold = paragraph.runs[0].font.bold + if (isBold): + if (currentDescription != None): + currentDescription.finishReading() + variableDescriptions.append(currentDescription) + + variableNames = extractVariableNamesFromRuns(paragraph.runs) + + currentDescription = VariableDescription(variableNames) + + for run in paragraph.runs: + currentDescription.description += run.text + continue + if (currentDescription != None): + currentDescription.description += paragraph.text if (currentDescription != None): currentDescription.finishReading() - variableDescriptions[currentDescription.name] = currentDescription + variableDescriptions.append(currentDescription) return variableDescriptions diff --git a/tools/standardTextToCode/readStandardTextDocxFile.py b/tools/standardTextToCode/readStandardTextDocxFile.py index cb8fa7e98..0ee9d5b01 100644 --- a/tools/standardTextToCode/readStandardTextDocxFile.py +++ b/tools/standardTextToCode/readStandardTextDocxFile.py @@ -4,18 +4,19 @@ import pickle def main(): - filename = "JVET-T2001-v2.docx" - print("Opening file " + 
filename) - document = Document(filename) + #filename = "JVET-T2001-v2.docx" + file = "/Users/cfeldman/Downloads/JVET-AN1019-v1/JVET-AN1019-v1_text_accepted.docx" + print("Opening file " + file) + document = Document(file) - variableDescriptions = parseDocForVariableDescriptions(document) + headingsToParse = ["Video usability information parameters", "SEI messages"] - print(f"Parsed {len(variableDescriptions)} variable descriptions: ") + variableDescriptions = parseDocForVariableDescriptions(document, headingsToParse) - # From where to where to parse. The last entry will not be included. + print(f"Parsed {len(variableDescriptions)} variable descriptions.") parsedTables = parseDocumentTables(document, variableDescriptions) - print ("Read {} classes".format(len(parsedTables))) + print ("Read {} tables".format(len(parsedTables))) # Dump everything to a file (debugging) pickle.dump( parsedTables, open( "tempPiclkle.p", "wb" ) ) diff --git a/tools/standardTextToCode/writeTablesC++.py b/tools/standardTextToCode/writeTablesC++.py index 47728be24..459c5b84e 100644 --- a/tools/standardTextToCode/writeTablesC++.py +++ b/tools/standardTextToCode/writeTablesC++.py @@ -2,6 +2,17 @@ import pickle from parseTables import * from pathlib import Path +from dataclasses import dataclass, field + + +@dataclass +class WritingSettings: + outputPath: str = "cpp" + namespace: str = "parser" + baseClass: str = "NalRBSP" + readerName: str = "SubByteReaderLogging" + includes: list[str] = field(default_factory=list) + def writeLicense(writer): writer.write( @@ -38,15 +49,18 @@ def writeLicense(writer): */\n\n""") class HeaderFile: - def __init__ (self, path, name, namespace): - self.f = open(f"{path}/{name}.h", "w") + def __init__ (self, settings: WritingSettings, name: str): + self.f = open(f"{settings.outputPath}/{name}.h", "w") writeLicense(self.f) self.f.write("#pragma once\n\n") - self.f.write("#include \"NalUnitVVC.h\"\n") - self.f.write("#include 
\"parser/common/SubByteReaderLogging.h\"\n\n") - self.f.write(f"""namespace {namespace}""") + for include in settings.includes: + self.f.write(f"#include \"{include}\"\n") + self.f.write("\n") + self.f.write("#include ") + self.f.write("\n") + self.f.write(f"""namespace {settings.namespace}""") self.f.write("\n{\n\n") - self.namespace = namespace + self.namespace = settings.namespace self.spaces = 0 def __del__(self): self.f.write(f"""}} // namespace {self.namespace}""") @@ -57,14 +71,14 @@ def write(self, s): self.f.write(s) class CppFile: - def __init__ (self, path, name, namespace): - self.f = open(f"{path}/{name}.cpp", "w") + def __init__ (self, settings: WritingSettings, name: str): + self.f = open(f"{settings.outputPath}/{name}.cpp", "w") writeLicense(self.f) self.f.write(f"""#include "{name}.h"\n""") self.f.write("""\n""") - self.f.write(f"""namespace {namespace}""") + self.f.write(f"""namespace {settings.namespace}""") self.f.write("\n{\n\n") - self.namespace = namespace + self.namespace = settings.namespace self.spaces = 0 def __del__(self): self.f.write(f"""}} // namespace {self.namespace}""") @@ -83,21 +97,26 @@ def argumentsToString(arguments, variableType = ""): s += f", {variableType}{arg}" return s -def writeBeginningToHeader(table, file): - file.write(f"class {table.name} : public NalRBSP\n") +def writeBeginningToHeader(table, file, readerName: str): + if table.type == TableType.NAL_UNIT: + file.write(f"class {table.name} : public NalRBSP\n") + elif table.type == TableType.SEI_MESSAGE: + file.write(f"class {table.name} : public SEI\n") + else: + file.write(f"class {table.name}\n") file.write("{\n") file.write(f"public:\n") file.write(f" {table.name}() = default;\n") file.write(f" ~{table.name}() = default;\n") - file.write(f" void parse(SubByteReaderLogging &reader{argumentsToString(table.arguments, 'int')});\n\n") + file.write(f" void parse({readerName} &reader{argumentsToString(table.arguments, 'int')});\n\n") def writeEndToHeader(file): file.spaces 
= 0 file.write("};\n") file.write("\n") -def writeBeginnginToSource(table, file): - file.write(f"void {table.name}::parse(SubByteReaderLogging &reader)\n") +def writeBeginnginToSource(table, file, readerName: str): + file.write(f"void {table.name}::parse({readerName} &reader{argumentsToString(table.arguments, 'int')})\n") file.write("{\n") def writeEndToSource(file): @@ -112,29 +131,48 @@ def writeItemsInContainer(container, files): writeItemToFiles(item, files) files[1].spaces -= 2 +def formatCondition(condition: str): + if "byte_aligned" in condition: + return "reader.byte_aligned()" + return condition + def writeItemToFiles(item, files): header = files[0] cpp = files[1] if (type(item) == Variable): typeString = "unsigned" arguments = "" - if (item.coding.codingType == Coding.UNSIGNED_VARIABLE and item.coding.length == 0): - # Length for code should be known. Write the info and something that will not compile - header.write(f"int {item.name} {{}};\n") - cpp.write(f"""this->{item.name} = reader.readBits("{item.name}", unknown)\n""") - return - if (item.coding.codingType in [Coding.FIXED_CODE, Coding.UNSIGNED_VARIABLE, Coding.UNSIGNED_FIXED]): - if (item.coding.length == 1): + if item.coding.codingType == Coding.UNSIGNED_VARIABLE: + nrBitsText = "variable" + if item.description != None and item.description.variableParsingLength != None: + nrBitsText = item.description.variableParsingLength + typeString = "int" + parseFunction = "readBits" + arguments = f", {nrBitsText}" + if (item.coding.codingType in [Coding.FIXED_CODE, Coding.UNSIGNED_FIXED]): + if item.coding.length == 1: typeString = "bool" parseFunction = "readFlag" else: parseFunction = "readBits" arguments = f", {item.coding.length}" - elif (item.coding.codingType == Coding.UNSIGNED_EXP): + elif item.coding.codingType == Coding.UNSIGNED_EXP: parseFunction = "readUEV" - elif (item.coding.codingType == Coding.SIGNED_EXP): + elif item.coding.codingType == Coding.SIGNED_EXP: parseFunction = "readSEV" typeString 
= "int" + elif item.coding.codingType == Coding.BYTE: + parseFunction = "readBits" + arguments = ", 8" + elif item.coding.codingType == Coding.SIGNED_FIXED: + parseFunction = "readBitsSigned" + typeString = "int" + arguments = f", {item.coding.length}" + elif item.coding.codingType == Coding.STRING: + parseFunction = "readString" + typeString = "std::string" + elif item.coding.codingType == Coding.UNKNOWN: + parseFunction = "unknown" name = item.name if (item.arrayIndex != None): @@ -152,12 +190,17 @@ def writeItemToFiles(item, files): header.write(f"{item.functionName} {item.functionName}_instance;\n") cpp.write(f"this->{item.functionName}_instance.parse(reader{argumentsToString(item.arguments)});\n") elif (type(item) == ContainerIf): - cpp.write(f"if ({item.condition})\n") + if item.isElse: + cpp.write(f"else\n") + elif item.isElseIf: + cpp.write(f"else if ({formatCondition(item.condition)})\n") + else: + cpp.write(f"if ({formatCondition(item.condition)})\n") cpp.write("{\n") writeItemsInContainer(item, files) cpp.write("}\n") elif (type(item) == ContainerWhile): - cpp.write(f"while ({item.condition})\n") + cpp.write(f"while ({formatCondition(item.condition)})\n") cpp.write("{\n") writeItemsInContainer(item, files) cpp.write("}\n") @@ -166,7 +209,7 @@ def writeItemToFiles(item, files): cpp.write("{\n") writeItemsInContainer(item, files) cpp.write("} ") - cpp.write(f"while({item.condition})\n") + cpp.write(f"while({formatCondition(item.condition)})\n") elif (type(item) == ContainerFor): variableType = "unsigned" if ("--" in item.increment): @@ -176,27 +219,29 @@ def writeItemToFiles(item, files): writeItemsInContainer(item, files) cpp.write("}\n") -def writeTableToFiles(table, files): - writeBeginningToHeader(table, files[0]) - writeBeginnginToSource(table, files[1]) +def writeTableToFiles(table, files, readerName): + writeBeginningToHeader(table, files[0], readerName) + writeBeginnginToSource(table, files[1], readerName) writeItemsInContainer(table, files) 
writeEndToSource(files[1]) writeEndToHeader(files[0]) -def writeTablesToCpp(parsedTables, path): - Path(path).mkdir(parents=True, exist_ok=True) - - namespace = "parser::vvc" +def writeTablesToCpp(parsedTables, settings: WritingSettings): + Path(settings.outputPath).mkdir(parents=True, exist_ok=True) for table in parsedTables: assert(type(table) == ContainerTable) print(f"Writing {table.name}") - files = (HeaderFile(path, table.name, namespace), CppFile(path, table.name, namespace)) - writeTableToFiles(table, files) + files = (HeaderFile(settings, table.name), CppFile(settings, table.name)) + writeTableToFiles(table, files, settings.readerName) def main(): + settings = WritingSettings() + settings.includes = ["Units.h", "SubByteReaderDummy.h"] + settings.readerName = "SubByteReaderDummy" + parsedTables = pickle.load(open("tempPiclkle.p", "rb")) - writeTablesToCpp(parsedTables, "cpp") + writeTablesToCpp(parsedTables, settings) if __name__ == "__main__": main()