From d0b11af2bd445d10383049d2d7574c19a95c9006 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 1 Aug 2016 23:11:31 -0700 Subject: [PATCH] new multiline string syntax This patch also moves a bunch of the parser code into the tokenizer. Closes #162. --- doc/langref.md | 99 ++++-- doc/vim/syntax/zig.vim | 34 +- src/all_types.hpp | 53 ++- src/analyze.cpp | 118 +++---- src/ast_render.cpp | 84 +++-- src/bignum.cpp | 17 + src/bignum.hpp | 11 +- src/codegen.cpp | 28 +- src/eval.cpp | 20 +- src/parseh.cpp | 46 +-- src/parser.cpp | 593 +++----------------------------- src/tokenizer.cpp | 753 +++++++++++++++++++++++------------------ src/tokenizer.hpp | 35 +- test/run_tests.cpp | 4 +- test/self_hosted.zig | 16 +- 15 files changed, 810 insertions(+), 1101 deletions(-) diff --git a/doc/langref.md b/doc/langref.md index 595287f5f6..642431a967 100644 --- a/doc/langref.md +++ b/doc/langref.md @@ -7,27 +7,27 @@ Root = many(TopLevelDecl) "EOF" TopLevelDecl = many(Directive) option(VisibleMod) (FnDef | ExternDecl | ContainerDecl | GlobalVarDecl | ErrorValueDecl | TypeDecl | UseDecl) -TypeDecl = "type" "Symbol" "=" TypeExpr ";" +TypeDecl = "type" Symbol "=" TypeExpr ";" -ErrorValueDecl = "error" "Symbol" ";" +ErrorValueDecl = "error" Symbol ";" GlobalVarDecl = VariableDeclaration ";" -VariableDeclaration = ("var" | "const") "Symbol" option(":" TypeExpr) "=" Expression +VariableDeclaration = ("var" | "const") Symbol option(":" TypeExpr) "=" Expression -ContainerDecl = ("struct" | "enum" | "union") "Symbol" option(ParamDeclList) "{" many(StructMember) "}" +ContainerDecl = ("struct" | "enum" | "union") Symbol option(ParamDeclList) "{" many(StructMember) "}" StructMember = many(Directive) option(VisibleMod) (StructField | FnDef | GlobalVarDecl | ContainerDecl) -StructField = "Symbol" option(":" Expression) ",") +StructField = Symbol option(":" Expression) ",") UseDecl = "use" Expression ";" ExternDecl = "extern" (FnProto | VariableDeclaration) ";" -FnProto = "fn" option("Symbol") 
ParamDeclList option("->" TypeExpr) +FnProto = "fn" option(Symbol) ParamDeclList option("->" TypeExpr) -Directive = "#" "Symbol" "(" Expression ")" +Directive = "#" Symbol "(" Expression ")" VisibleMod = "pub" | "export" @@ -35,13 +35,13 @@ FnDef = option("inline" | "extern") FnProto Block ParamDeclList = "(" list(ParamDecl, ",") ")" -ParamDecl = option("noalias" | "inline") option("Symbol" ":") TypeExpr | "..." +ParamDecl = option("noalias" | "inline") option(Symbol ":") TypeExpr | "..." Block = "{" list(option(Statement), ";") "}" Statement = Label | VariableDeclaration ";" | Defer ";" | NonBlockExpression ";" | BlockExpression -Label = "Symbol" ":" +Label = Symbol ":" Expression = BlockExpression | NonBlockExpression @@ -49,23 +49,23 @@ TypeExpr = PrefixOpExpression NonBlockExpression = ReturnExpression | AssignmentExpression -AsmExpression = "asm" option("volatile") "(" "String" option(AsmOutput) ")" +AsmExpression = "asm" option("volatile") "(" String option(AsmOutput) ")" AsmOutput = ":" list(AsmOutputItem, ",") option(AsmInput) AsmInput = ":" list(AsmInputItem, ",") option(AsmClobbers) -AsmOutputItem = "[" "Symbol" "]" "String" "(" ("Symbol" | "->" TypeExpr) ")" +AsmOutputItem = "[" Symbol "]" String "(" (Symbol | "->" TypeExpr) ")" -AsmInputItem = "[" "Symbol" "]" "String" "(" Expression ")" +AsmInputItem = "[" Symbol "]" String "(" Expression ")" -AsmClobbers= ":" list("String", ",") +AsmClobbers= ":" list(String, ",") UnwrapExpression = BoolOrExpression (UnwrapMaybe | UnwrapError) | BoolOrExpression UnwrapMaybe = "??" 
Expression -UnwrapError = "%%" option("|" "Symbol" "|") Expression +UnwrapError = "%%" option("|" Symbol "|") Expression AssignmentExpression = UnwrapExpression AssignmentOperator UnwrapExpression | UnwrapExpression @@ -75,13 +75,13 @@ BlockExpression = IfExpression | Block | WhileExpression | ForExpression | Switc SwitchExpression = "switch" "(" Expression ")" "{" many(SwitchProng) "}" -SwitchProng = (list(SwitchItem, ",") | "else") "=>" option("|" "Symbol" "|") Expression "," +SwitchProng = (list(SwitchItem, ",") | "else") "=>" option("|" Symbol "|") Expression "," SwitchItem = Expression | (Expression "..." Expression) WhileExpression = "while" "(" Expression option(";" Expression) ")" Expression -ForExpression = "for" "(" Expression ")" option("|" option("*") "Symbol" option("," "Symbol") "|") Expression +ForExpression = "for" "(" Expression ")" option("|" option("*") Symbol option("," Symbol) "|") Expression BoolOrExpression = BoolAndExpression "||" BoolOrExpression | BoolAndExpression @@ -93,7 +93,7 @@ IfExpression = IfVarExpression | IfBoolExpression IfBoolExpression = "if" "(" Expression ")" Expression option(Else) -IfVarExpression = "if" "(" ("const" | "var") option("*") "Symbol" option(":" TypeExpr) "?=" Expression ")" Expression Option(Else) +IfVarExpression = "if" "(" ("const" | "var") option("*") Symbol option(":" TypeExpr) "?=" Expression ")" Expression Option(Else) Else = "else" Expression @@ -127,7 +127,7 @@ PrefixOpExpression = PrefixOp PrefixOpExpression | SuffixOpExpression SuffixOpExpression = PrimaryExpression option(FnCallExpression | ArrayAccessExpression | FieldAccessExpression | SliceExpression) -FieldAccessExpression = "." "Symbol" +FieldAccessExpression = "." Symbol FnCallExpression = "(" list(Expression, ",") ")" @@ -139,15 +139,15 @@ ContainerInitExpression = "{" ContainerInitBody "}" ContainerInitBody = list(StructLiteralField, ",") | list(Expression, ",") -StructLiteralField = "." "Symbol" "=" Expression +StructLiteralField = "." 
Symbol "=" Expression PrefixOp = "!" | "-" | "~" | "*" | ("&" option("const")) | "?" | "%" | "%%" | "??" | "-%" -PrimaryExpression = "Number" | "String" | "CharLiteral" | KeywordLiteral | GroupedExpression | GotoExpression | BlockExpression | "Symbol" | ("@" "Symbol" FnCallExpression) | ArrayType | (option("extern") FnProto) | AsmExpression | ("error" "." "Symbol") +PrimaryExpression = Number | String | CharLiteral | KeywordLiteral | GroupedExpression | GotoExpression | BlockExpression | Symbol | ("@" Symbol FnCallExpression) | ArrayType | (option("extern") FnProto) | AsmExpression | ("error" "." Symbol) ArrayType = "[" option(Expression) "]" option("const") TypeExpr -GotoExpression = "goto" "Symbol" +GotoExpression = "goto" Symbol GroupedExpression = "(" Expression ")" @@ -265,14 +265,13 @@ from codegen. ### Literals #### Character and String Literals + ``` Literal Example Characters Escapes Null Term Type Byte 'H' All ASCII Byte No u8 UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8 UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8 -UTF-8 Raw String r"X(hello)X" All Unicode None No [5]u8 -UTF-8 Raw C String rc"X(hello)X" All Unicode None Yes &const u8 ``` ### Escapes @@ -291,26 +290,56 @@ UTF-8 Raw C String rc"X(hello)X" All Unicode None Yes &const Note that the maximum valid Unicode point is 0x10ffff. -##### Raw Strings +##### Multiline String Literals -Raw string literals have no escapes and can span across multiple lines. To -start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('. -To end a raw string, use ')' followed by the same unique bytes, followed by '"'. +Multiline string literals have no escapes and can span across multiple lines. +To start a multiline string literal, use the `\\` token. Just like a comment, +the string literal goes until the end of the line. The end of the line is not +included in the string literal. 
+However, if the next line begins with `\\` then a newline is appended and +the string literal continues. -#### Numeric Literals +Example: +```zig +const hello_world_in_c = + \\#include <stdio.h> + \\ + \\int main(int argc, char **argv) { + \\ printf("hello world\n"); + \\ return 0; + \\} +; ``` -Number literals Example Exponentiation -Decimal integer 98222 N/A -Hex integer 0xff N/A -Octal integer 0o77 N/A -Binary integer 0b11110000 N/A -Floating-point 123.0E+77 Optional -Hex floating point TODO TODO +For a multiline C string literal, prepend `c` to each `\\`. Example: + +```zig +const c_string_literal = + c\\#include <stdio.h> + c\\ + c\\int main(int argc, char **argv) { + c\\ printf("hello world\n"); + c\\ return 0; + c\\} +; ``` +In this example the variable `c_string_literal` has type `&const u8` and +has a terminating null byte. + +#### Number Literals + + Number literals | Example | Exponentiation +--------------------|-------------|-------------- + Decimal integer | 98222 | N/A + Hex integer | 0xff | N/A + Octal integer | 0o77 | N/A + Binary integer | 0b11110000 | N/A + Floating point | 123.0E+77 | Optional + Hex floating point | 0x103.70p-5 | Optional + ### Identifiers TODO diff --git a/doc/vim/syntax/zig.vim b/doc/vim/syntax/zig.vim index 17a034c50d..32852f6a60 100644 --- a/doc/vim/syntax/zig.vim +++ b/doc/vim/syntax/zig.vim @@ -1,11 +1,12 @@ " Vim syntax file " Language: Zig " Maintainer: Andrew Kelley -" Latest Revision: 28 July 2016 +" Latest Revision: 03 August 2016 if exists("b:current_syntax") finish endif +let b:current_syntax = "zig" syn keyword zigStorage const var extern export pub noalias inline noinline syn keyword zigStructure struct enum union @@ -24,33 +25,30 @@ syn keyword zigBoolean true false syn match zigOperator display "\%(+%\?\|-%\?\|/\|*%\?\|=\|\^\|&\|?\||\|!\|>\|<\|%\|<<%\?\|>>\|&&\|||\)=\?" 
syn match zigArrowCharacter display "->" -syn match zigDecNumber display "\<[0-9][0-9_]*\%([iu]\%(size\|8\|16\|32\|64\)\)\=" -syn match zigHexNumber display "\<0x[a-fA-F0-9_]\+\%([iu]\%(size\|8\|16\|32\|64\)\)\=" -syn match zigOctNumber display "\<0o[0-7_]\+\%([iu]\%(size\|8\|16\|32\|64\)\)\=" -syn match zigBinNumber display "\<0b[01_]\+\%([iu]\%(size\|8\|16\|32\|64\)\)\=" +syn match zigDecNumber display "\<[0-9]*\%(.[0-9]\+\)\=\%([eE][+-]\?[0-9]\+\)\=" +syn match zigHexNumber display "\<0x[a-fA-F0-9]\+\%(.[a-fA-F0-9]\+\%([pP][+-]\?[0-9]\+\)\?\)\=" +syn match zigOctNumber display "\<0o[0-7]\+" +syn match zigBinNumber display "\<0b[01]\+\%(.[01]\+\%([eE][+-]\?[0-9]\+\)\?\)\=" syn match zigCharacterInvalid display contained /b\?'\zs[\n\r\t']\ze'/ syn match zigCharacterInvalidUnicode display contained /b'\zs[^[:cntrl:][:graph:][:alnum:][:space:]]\ze'/ syn match zigCharacter /b'\([^\\]\|\\\(.\|x\x\{2}\)\)'/ contains=zigEscape,zigEscapeError,zigCharacterInvalid,zigCharacterInvalidUnicode -syn match zigCharacter /'\([^\\]\|\\\(.\|x\x\{2}\|u\x\{4}\|U\x\{8}\|u{\x\{1,6}}\)\)'/ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigCharacterInvalid - -syn match zigShebang /\%^#![^[].*/ +syn match zigCharacter /'\([^\\]\|\\\(.\|x\x\{2}\|u\x\{4}\|U\x\{6}\)\)'/ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigCharacterInvalid syn region zigCommentLine start="//" end="$" contains=zigTodo,@Spell syn region zigCommentLineDoc start="//\%(//\@!\|!\)" end="$" contains=zigTodo,@Spell +" TODO match only the first '\\' within the zigMultilineString as zigMultilineStringPrefix +syn match zigMultilineStringPrefix display contained /c\?\\\\/ +syn region zigMultilineString start="c\?\\\\" end="$" contains=zigMultilineStringPrefix + syn keyword zigTodo contained TODO XXX syn match zigEscapeError display contained /\\./ -syn match zigEscape display contained /\\\([nrt0\\'"]\|x\x\{2}\)/ -syn match zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/ -syn match zigEscapeUnicode 
display contained /\\u{\x\{1,6}}/ -syn match zigStringContinuation display contained /\\\n\s*/ -syn region zigString start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ oneline contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell -syn region zigString start='r"\z([^)]*\)(' end=')\z1"' contains=@Spell - -let b:current_syntax = "zig" +syn match zigEscape display contained /\\\([nrt\\'"]\|x\x\{2}\)/ +syn match zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{6}\)/ +syn region zigString start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ oneline contains=zigEscape,zigEscapeUnicode,zigEscapeError,@Spell hi def link zigDecNumber zigNumber hi def link zigHexNumber zigNumber @@ -59,12 +57,12 @@ hi def link zigBinNumber zigNumber hi def link zigKeyword Keyword hi def link zigType Type -hi def link zigShebang Comment hi def link zigCommentLine Comment hi def link zigCommentLineDoc SpecialComment hi def link zigTodo Todo -hi def link zigStringContinuation Special hi def link zigString String +hi def link zigMultilineString String +hi def link zigMultilineStringPrefix Comment hi def link zigCharacterInvalid Error hi def link zigCharacterInvalidUnicode zigCharacterInvalid hi def link zigCharacter Character diff --git a/src/all_types.hpp b/src/all_types.hpp index 3598147700..cb8968723e 100644 --- a/src/all_types.hpp +++ b/src/all_types.hpp @@ -194,7 +194,7 @@ struct AstNodeRoot { struct AstNodeFnProto { TopLevelDecl top_level_decl; - Buf name; + Buf *name; ZigList params; AstNode *return_type; bool is_var_args; @@ -229,7 +229,7 @@ struct AstNodeFnDecl { }; struct AstNodeParamDecl { - Buf name; + Buf *name; AstNode *type; bool is_noalias; bool is_inline; @@ -279,7 +279,7 @@ struct AstNodeDefer { struct AstNodeVariableDeclaration { TopLevelDecl top_level_decl; - Buf symbol; + Buf *symbol; bool is_const; bool is_extern; // one or both of type and expr will be non null @@ -293,7 +293,7 @@ struct AstNodeVariableDeclaration { struct AstNodeTypeDecl { TopLevelDecl top_level_decl; - 
Buf symbol; + Buf *symbol; AstNode *child_type; // populated by semantic analyzer @@ -305,7 +305,7 @@ struct AstNodeTypeDecl { struct AstNodeErrorValueDecl { TopLevelDecl top_level_decl; - Buf name; + Buf *name; // populated by semantic analyzer ErrorTableEntry *err; @@ -434,7 +434,7 @@ struct AstNodeSliceExpr { struct AstNodeFieldAccessExpr { AstNode *struct_expr; - Buf field_name; + Buf *field_name; // populated by semantic analyzer TypeStructField *type_struct_field; @@ -448,7 +448,7 @@ struct AstNodeFieldAccessExpr { }; struct AstNodeDirective { - Buf name; + Buf *name; AstNode *expr; }; @@ -555,7 +555,7 @@ struct AstNodeSwitchRange { }; struct AstNodeLabel { - Buf name; + Buf *name; // populated by semantic analyzer Expr resolved_expr; @@ -563,7 +563,7 @@ struct AstNodeLabel { }; struct AstNodeGoto { - Buf name; + Buf *name; // populated by semantic analyzer Expr resolved_expr; @@ -571,9 +571,9 @@ struct AstNodeGoto { }; struct AsmOutput { - Buf asm_symbolic_name; - Buf constraint; - Buf variable_name; + Buf *asm_symbolic_name; + Buf *constraint; + Buf *variable_name; AstNode *return_type; // null unless "=r" and return // populated by semantic analyzer @@ -581,8 +581,8 @@ struct AsmOutput { }; struct AsmInput { - Buf asm_symbolic_name; - Buf constraint; + Buf *asm_symbolic_name; + Buf *constraint; AstNode *expr; }; @@ -593,8 +593,7 @@ struct SrcPos { struct AstNodeAsmExpr { bool is_volatile; - Buf asm_template; - ZigList offset_map; + Buf *asm_template; ZigList token_list; ZigList output_list; ZigList input_list; @@ -613,7 +612,7 @@ enum ContainerKind { struct AstNodeStructDecl { TopLevelDecl top_level_decl; - Buf name; + Buf *name; ContainerKind kind; ZigList generic_params; bool generic_params_is_var_args; // always an error but it can happen from parsing @@ -629,12 +628,12 @@ struct AstNodeStructDecl { struct AstNodeStructField { TopLevelDecl top_level_decl; - Buf name; + Buf *name; AstNode *type; }; struct AstNodeStringLiteral { - Buf buf; + Buf *buf; 
bool c; // populated by semantic analyzer: @@ -648,29 +647,19 @@ struct AstNodeCharLiteral { Expr resolved_expr; }; -enum NumLit { - NumLitFloat, - NumLitUInt, -}; - struct AstNodeNumberLiteral { - NumLit kind; + BigNum *bignum; // overflow is true if when parsing the number, we discovered it would not // fit without losing data in a uint64_t or double bool overflow; - union { - uint64_t x_uint; - double x_float; - } data; - // populated by semantic analyzer Expr resolved_expr; }; struct AstNodeStructValueField { - Buf name; + Buf *name; AstNode *expr; // populated by semantic analyzer @@ -706,7 +695,7 @@ struct AstNodeUndefinedLiteral { }; struct AstNodeSymbolExpr { - Buf symbol; + Buf *symbol; // populated by semantic analyzer Expr resolved_expr; diff --git a/src/analyze.cpp b/src/analyze.cpp index 6bad7fab2d..1b9d385ebc 100644 --- a/src/analyze.cpp +++ b/src/analyze.cpp @@ -1053,7 +1053,7 @@ static void resolve_function_proto(CodeGen *g, AstNode *node, FnTableEntry *fn_t if (fn_proto->top_level_decl.directives) { for (int i = 0; i < fn_proto->top_level_decl.directives->length; i += 1) { AstNode *directive_node = fn_proto->top_level_decl.directives->at(i); - Buf *name = &directive_node->data.directive.name; + Buf *name = directive_node->data.directive.name; if (buf_eql_str(name, "attribute")) { if (fn_table_entry->fn_def_node) { @@ -1251,7 +1251,7 @@ static void resolve_enum_type(CodeGen *g, ImportTableEntry *import, TypeTableEnt for (uint32_t i = 0; i < field_count; i += 1) { AstNode *field_node = decl_node->data.struct_decl.fields.at(i); TypeEnumField *type_enum_field = &enum_type->data.enumeration.fields[i]; - type_enum_field->name = &field_node->data.struct_field.name; + type_enum_field->name = field_node->data.struct_field.name; TypeTableEntry *field_type = analyze_type_expr(g, import, context, field_node->data.struct_field.type); type_enum_field->type_entry = field_type; @@ -1365,7 +1365,7 @@ static void resolve_enum_type(CodeGen *g, ImportTableEntry 
*import, TypeTableEnt uint64_t debug_align_in_bits = 8*LLVMABISizeOfType(g->target_data_ref, enum_type->type_ref); LLVMZigDIType *replacement_di_type = LLVMZigCreateDebugStructType(g->dbuilder, LLVMZigFileToScope(import->di_file), - buf_ptr(&decl_node->data.struct_decl.name), + buf_ptr(decl_node->data.struct_decl.name), import->di_file, decl_node->line + 1, debug_size_in_bits, debug_align_in_bits, @@ -1381,7 +1381,7 @@ static void resolve_enum_type(CodeGen *g, ImportTableEntry *import, TypeTableEnt uint64_t tag_debug_size_in_bits = 8*LLVMStoreSizeOfType(g->target_data_ref, tag_type_entry->type_ref); uint64_t tag_debug_align_in_bits = 8*LLVMABISizeOfType(g->target_data_ref, tag_type_entry->type_ref); LLVMZigDIType *tag_di_type = LLVMZigCreateDebugEnumerationType(g->dbuilder, - LLVMZigFileToScope(import->di_file), buf_ptr(&decl_node->data.struct_decl.name), + LLVMZigFileToScope(import->di_file), buf_ptr(decl_node->data.struct_decl.name), import->di_file, decl_node->line + 1, tag_debug_size_in_bits, tag_debug_align_in_bits, @@ -1441,7 +1441,7 @@ static void resolve_struct_type(CodeGen *g, ImportTableEntry *import, TypeTableE for (int i = 0; i < field_count; i += 1) { AstNode *field_node = decl_node->data.struct_decl.fields.at(i); TypeStructField *type_struct_field = &struct_type->data.structure.fields[i]; - type_struct_field->name = &field_node->data.struct_field.name; + type_struct_field->name = field_node->data.struct_field.name; TypeTableEntry *field_type = analyze_type_expr(g, import, context, field_node->data.struct_field.type); type_struct_field->type_entry = field_type; @@ -1514,7 +1514,7 @@ static void resolve_struct_type(CodeGen *g, ImportTableEntry *import, TypeTableE uint64_t debug_align_in_bits = 8*LLVMABISizeOfType(g->target_data_ref, struct_type->type_ref); LLVMZigDIType *replacement_di_type = LLVMZigCreateDebugStructType(g->dbuilder, LLVMZigFileToScope(import->di_file), - buf_ptr(&decl_node->data.struct_decl.name), + 
buf_ptr(decl_node->data.struct_decl.name), import->di_file, decl_node->line + 1, debug_size_in_bits, debug_align_in_bits, @@ -1570,7 +1570,7 @@ static void preview_fn_proto_instance(CodeGen *g, ImportTableEntry *import, AstN assert(!is_generic_instance || !is_generic_fn); AstNode *parent_decl = proto_node->data.fn_proto.top_level_decl.parent_decl; - Buf *proto_name = &proto_node->data.fn_proto.name; + Buf *proto_name = proto_node->data.fn_proto.name; AstNode *fn_def_node = proto_node->data.fn_proto.fn_def_node; bool is_extern = proto_node->data.fn_proto.is_extern; @@ -1645,7 +1645,7 @@ static void scan_struct_decl(CodeGen *g, ImportTableEntry *import, BlockContext return; } - Buf *name = &node->data.struct_decl.name; + Buf *name = node->data.struct_decl.name; TypeTableEntry *container_type = get_partial_container_type(g, import, context, node->data.struct_decl.kind, node, buf_ptr(name)); node->data.struct_decl.type_entry = container_type; @@ -1692,7 +1692,7 @@ static void preview_error_value_decl(CodeGen *g, AstNode *node) { ErrorTableEntry *err = allocate(1); err->decl_node = node; - buf_init_from_buf(&err->name, &node->data.error_value_decl.name); + buf_init_from_buf(&err->name, node->data.error_value_decl.name); auto existing_entry = g->error_table.maybe_get(&err->name); if (existing_entry) { @@ -1749,7 +1749,7 @@ static void resolve_top_level_decl(CodeGen *g, AstNode *node, bool pointer_only) case NodeTypeTypeDecl: { AstNode *type_node = node->data.type_decl.child_type; - Buf *decl_name = &node->data.type_decl.symbol; + Buf *decl_name = node->data.type_decl.symbol; TypeTableEntry *entry; if (node->data.type_decl.override_type) { @@ -2479,12 +2479,12 @@ static TypeTableEntry *analyze_container_init_expr(CodeGen *g, ImportTableEntry val_field_node->block_context = context; TypeStructField *type_field = find_struct_type_field(container_type, - &val_field_node->data.struct_val_field.name); + val_field_node->data.struct_val_field.name); if (!type_field) { 
add_node_error(g, val_field_node, buf_sprintf("no member named '%s' in '%s'", - buf_ptr(&val_field_node->data.struct_val_field.name), buf_ptr(&container_type->name))); + buf_ptr(val_field_node->data.struct_val_field.name), buf_ptr(&container_type->name))); continue; } @@ -2604,7 +2604,7 @@ static TypeTableEntry *analyze_field_access_expr(CodeGen *g, ImportTableEntry *i AstNode **struct_expr_node = &node->data.field_access_expr.struct_expr; TypeTableEntry *struct_type = analyze_expression(g, import, context, nullptr, *struct_expr_node); - Buf *field_name = &node->data.field_access_expr.field_name; + Buf *field_name = node->data.field_access_expr.field_name; bool wrapped_in_fn_call = node->data.field_access_expr.is_fn_call; @@ -2965,6 +2965,22 @@ static TypeTableEntry *resolve_expr_const_val_as_string_lit(CodeGen *g, AstNode return get_array_type(g, g->builtin_types.entry_u8, buf_len(str)); } +static TypeTableEntry *resolve_expr_const_val_as_bignum(CodeGen *g, AstNode *node, + TypeTableEntry *expected_type, BigNum *bignum, bool depends_on_compile_var) +{ + Expr *expr = get_resolved_expr(node); + expr->const_val.ok = true; + expr->const_val.depends_on_compile_var = depends_on_compile_var; + + bignum_init_bignum(&expr->const_val.data.x_bignum, bignum); + if (bignum->kind == BigNumKindInt) { + return g->builtin_types.entry_num_lit_int; + } else if (bignum->kind == BigNumKindFloat) { + return g->builtin_types.entry_num_lit_float; + } else { + zig_unreachable(); + } +} static TypeTableEntry *resolve_expr_const_val_as_unsigned_num_lit(CodeGen *g, AstNode *node, TypeTableEntry *expected_type, uint64_t x, bool depends_on_compile_var) @@ -2978,17 +2994,6 @@ static TypeTableEntry *resolve_expr_const_val_as_unsigned_num_lit(CodeGen *g, As return g->builtin_types.entry_num_lit_int; } -static TypeTableEntry *resolve_expr_const_val_as_float_num_lit(CodeGen *g, AstNode *node, - TypeTableEntry *expected_type, double x) -{ - Expr *expr = get_resolved_expr(node); - expr->const_val.ok 
= true; - - bignum_init_float(&expr->const_val.data.x_bignum, x); - - return g->builtin_types.entry_num_lit_float; -} - static TypeTableEntry *analyze_error_literal_expr(CodeGen *g, ImportTableEntry *import, BlockContext *context, AstNode *node, Buf *err_name) { @@ -3073,7 +3078,7 @@ static TypeTableEntry *analyze_symbol_expr(CodeGen *g, ImportTableEntry *import, return resolve_expr_const_val_as_type(g, node, node->data.symbol_expr.override_type_entry, false); } - Buf *variable_name = &node->data.symbol_expr.symbol; + Buf *variable_name = node->data.symbol_expr.symbol; auto primitive_table_entry = g->primitive_type_table.maybe_get(variable_name); if (primitive_table_entry) { @@ -3177,7 +3182,7 @@ static TypeTableEntry *analyze_lvalue(CodeGen *g, ImportTableEntry *import, Bloc return g->builtin_types.entry_invalid; } if (purpose != LValPurposeAddressOf) { - Buf *name = &lhs_node->data.symbol_expr.symbol; + Buf *name = lhs_node->data.symbol_expr.symbol; VariableTableEntry *var = find_variable(g, block_context, name); if (var) { if (var->is_const) { @@ -3742,7 +3747,7 @@ static TypeTableEntry *analyze_unwrap_error_expr(CodeGen *g, ImportTableEntry *i if (var_node) { child_context = new_block_context(node, parent_context); var_node->block_context = child_context; - Buf *var_name = &var_node->data.symbol_expr.symbol; + Buf *var_name = var_node->data.symbol_expr.symbol; node->data.unwrap_err_expr.var = add_local_var(g, var_node, import, child_context, var_name, g->builtin_types.entry_pure_error, true, nullptr); } else { @@ -3827,7 +3832,7 @@ static VariableTableEntry *analyze_variable_declaration_raw(CodeGen *g, ImportTa assert(type != nullptr); // should have been caught by the parser VariableTableEntry *var = add_local_var(g, source_node, import, context, - &variable_declaration->symbol, type, is_const, + variable_declaration->symbol, type, is_const, expr_is_maybe ? 
nullptr : variable_declaration->expr); variable_declaration->variable = var; @@ -3886,15 +3891,7 @@ static TypeTableEntry *analyze_number_literal_expr(CodeGen *g, ImportTableEntry return g->builtin_types.entry_invalid; } - if (node->data.number_literal.kind == NumLitUInt) { - return resolve_expr_const_val_as_unsigned_num_lit(g, node, - expected_type, node->data.number_literal.data.x_uint, false); - } else if (node->data.number_literal.kind == NumLitFloat) { - return resolve_expr_const_val_as_float_num_lit(g, node, - expected_type, node->data.number_literal.data.x_float); - } else { - zig_unreachable(); - } + return resolve_expr_const_val_as_bignum(g, node, expected_type, node->data.number_literal.bignum, false); } static TypeTableEntry *analyze_array_type(CodeGen *g, ImportTableEntry *import, BlockContext *context, @@ -4034,13 +4031,13 @@ static TypeTableEntry *analyze_for_expr(CodeGen *g, ImportTableEntry *import, Bl AstNode *elem_var_node = node->data.for_expr.elem_node; elem_var_node->block_context = child_context; - Buf *elem_var_name = &elem_var_node->data.symbol_expr.symbol; + Buf *elem_var_name = elem_var_node->data.symbol_expr.symbol; node->data.for_expr.elem_var = add_local_var(g, elem_var_node, import, child_context, elem_var_name, var_type, true, nullptr); AstNode *index_var_node = node->data.for_expr.index_node; if (index_var_node) { - Buf *index_var_name = &index_var_node->data.symbol_expr.symbol; + Buf *index_var_name = index_var_node->data.symbol_expr.symbol; index_var_node->block_context = child_context; node->data.for_expr.index_var = add_local_var(g, index_var_node, import, child_context, index_var_name, g->builtin_types.entry_usize, true, nullptr); @@ -4952,7 +4949,7 @@ static TypeTableEntry *analyze_builtin_fn_call_expr(CodeGen *g, ImportTableEntry assert(node->type == NodeTypeFnCallExpr); AstNode *fn_ref_expr = node->data.fn_call_expr.fn_ref_expr; - Buf *name = &fn_ref_expr->data.symbol_expr.symbol; + Buf *name = 
fn_ref_expr->data.symbol_expr.symbol; auto entry = g->builtin_fn_table.maybe_get(name); @@ -5476,7 +5473,7 @@ static TypeTableEntry *analyze_fn_call_with_inline_args(CodeGen *g, ImportTableE ConstExprValue *const_val = &get_resolved_expr(*param_node)->const_val; if (const_val->ok) { VariableTableEntry *var = add_local_var(g, generic_param_decl_node, decl_node->owner, child_context, - &generic_param_decl_node->data.param_decl.name, param_type, true, *param_node); + generic_param_decl_node->data.param_decl.name, param_type, true, *param_node); // This generic function instance could be called with anything, so when this variable is read it // needs to know that it depends on compile time variable data. var->force_depends_on_compile_var = true; @@ -5570,7 +5567,7 @@ static TypeTableEntry *analyze_generic_fn_call(CodeGen *g, ImportTableEntry *imp ConstExprValue *const_val = &get_resolved_expr(*param_node)->const_val; if (const_val->ok) { VariableTableEntry *var = add_local_var(g, generic_param_decl_node, decl_node->owner, child_context, - &generic_param_decl_node->data.param_decl.name, param_type, true, *param_node); + generic_param_decl_node->data.param_decl.name, param_type, true, *param_node); var->force_depends_on_compile_var = true; } else { add_node_error(g, *param_node, buf_sprintf("unable to evaluate constant expression")); @@ -5964,7 +5961,7 @@ static TypeTableEntry *analyze_switch_expr(CodeGen *g, ImportTableEntry *import, if (expr_type->id == TypeTableEntryIdEnum) { if (item_node->type == NodeTypeSymbol) { - Buf *field_name = &item_node->data.symbol_expr.symbol; + Buf *field_name = item_node->data.symbol_expr.symbol; TypeEnumField *type_enum_field = get_enum_field(expr_type, field_name); if (type_enum_field) { item_node->data.symbol_expr.enum_field = type_enum_field; @@ -6000,7 +5997,7 @@ static TypeTableEntry *analyze_switch_expr(CodeGen *g, ImportTableEntry *import, } } else if (expr_type->id == TypeTableEntryIdErrorUnion) { if (item_node->type == 
NodeTypeSymbol) { - Buf *err_name = &item_node->data.symbol_expr.symbol; + Buf *err_name = item_node->data.symbol_expr.symbol; bool is_ok_case = buf_eql_str(err_name, "Ok"); auto err_table_entry = is_ok_case ? nullptr: g->error_table.maybe_get(err_name); if (is_ok_case || err_table_entry) { @@ -6072,7 +6069,7 @@ static TypeTableEntry *analyze_switch_expr(CodeGen *g, ImportTableEntry *import, AstNode *var_node = prong_node->data.switch_prong.var_symbol; if (var_node) { assert(var_node->type == NodeTypeSymbol); - Buf *var_name = &var_node->data.symbol_expr.symbol; + Buf *var_name = var_node->data.symbol_expr.symbol; var_node->block_context = child_context; prong_node->data.switch_prong.var = add_local_var(g, var_node, import, child_context, var_name, var_type, true, nullptr); @@ -6228,9 +6225,9 @@ static TypeTableEntry *analyze_string_literal_expr(CodeGen *g, ImportTableEntry TypeTableEntry *expected_type, AstNode *node) { if (node->data.string_literal.c) { - return resolve_expr_const_val_as_c_string_lit(g, node, &node->data.string_literal.buf); + return resolve_expr_const_val_as_c_string_lit(g, node, node->data.string_literal.buf); } else { - return resolve_expr_const_val_as_string_lit(g, node, &node->data.string_literal.buf); + return resolve_expr_const_val_as_string_lit(g, node, node->data.string_literal.buf); } } @@ -6255,7 +6252,7 @@ static TypeTableEntry *analyze_block_expr(CodeGen *g, ImportTableEntry *import, child->data.label.label_entry = label; fn_table_entry->all_labels.append(label); - child_context->label_table.put(&child->data.label.name, label); + child_context->label_table.put(child->data.label.name, label); return_type = g->builtin_types.entry_void; continue; @@ -6316,7 +6313,7 @@ static TypeTableEntry *analyze_asm_expr(CodeGen *g, ImportTableEntry *import, Bl break; } } else { - Buf *variable_name = &asm_output->variable_name; + Buf *variable_name = asm_output->variable_name; VariableTableEntry *var = find_variable(g, context, variable_name); if 
(var) { asm_output->variable = var; @@ -6351,7 +6348,7 @@ static TypeTableEntry *analyze_goto_pass1(CodeGen *g, ImportTableEntry *import, static void analyze_goto_pass2(CodeGen *g, ImportTableEntry *import, AstNode *node) { assert(node->type == NodeTypeGoto); - Buf *label_name = &node->data.goto_expr.name; + Buf *label_name = node->data.goto_expr.name; BlockContext *context = node->block_context; assert(context); LabelTableEntry *label = find_label(g, context, label_name); @@ -6549,11 +6546,11 @@ static void analyze_fn_body(CodeGen *g, FnTableEntry *fn_table_entry) { buf_sprintf("byvalue struct parameters not yet supported on extern functions")); } - if (buf_len(&param_decl->name) == 0) { + if (buf_len(param_decl->name) == 0) { add_node_error(g, param_decl_node, buf_sprintf("missing parameter name")); } - VariableTableEntry *var = add_local_var(g, param_decl_node, import, context, &param_decl->name, + VariableTableEntry *var = add_local_var(g, param_decl_node, import, context, param_decl->name, type, true, nullptr); var->src_arg_index = i; param_decl_node->data.param_decl.variable = var; @@ -6583,7 +6580,7 @@ static void analyze_fn_body(CodeGen *g, FnTableEntry *fn_table_entry) { if (!label->used) { add_node_error(g, label->decl_node, buf_sprintf("label '%s' defined but not used", - buf_ptr(&label->decl_node->data.label.name))); + buf_ptr(label->decl_node->data.label.name))); } } @@ -6640,7 +6637,7 @@ static void scan_decls(CodeGen *g, ImportTableEntry *import, BlockContext *conte break; case NodeTypeContainerDecl: { - Buf *name = &node->data.struct_decl.name; + Buf *name = node->data.struct_decl.name; add_top_level_decl(g, import, context, node, name); if (node->data.struct_decl.generic_params.length == 0) { scan_struct_decl(g, import, context, node); @@ -6653,20 +6650,20 @@ static void scan_decls(CodeGen *g, ImportTableEntry *import, BlockContext *conte break; case NodeTypeVariableDeclaration: { - Buf *name = &node->data.variable_declaration.symbol; + Buf *name = 
node->data.variable_declaration.symbol; add_top_level_decl(g, import, context, node, name); break; } case NodeTypeTypeDecl: { - Buf *name = &node->data.type_decl.symbol; + Buf *name = node->data.type_decl.symbol; add_top_level_decl(g, import, context, node, name); break; } case NodeTypeFnProto: { // if the name is missing, we immediately announce an error - Buf *fn_name = &node->data.fn_proto.name; + Buf *fn_name = node->data.fn_proto.name; if (buf_len(fn_name) == 0) { node->data.fn_proto.skip = true; add_node_error(g, node, buf_sprintf("missing function name")); @@ -6851,6 +6848,9 @@ ImportTableEntry *add_source_file(CodeGen *g, PackageTableEntry *package, assert(import_entry->root); if (g->verbose) { ast_print(stderr, import_entry->root, 0); + //fprintf(stderr, "\nReformatted Source:\n"); + //fprintf(stderr, "---------------------\n"); + //ast_render(stderr, import_entry->root, 4); } import_entry->di_file = LLVMZigCreateFile(g->dbuilder, buf_ptr(src_basename), buf_ptr(src_dirname)); @@ -6868,7 +6868,7 @@ ImportTableEntry *add_source_file(CodeGen *g, PackageTableEntry *package, if (top_level_decl->type == NodeTypeFnDef) { AstNode *proto_node = top_level_decl->data.fn_def.fn_proto; assert(proto_node->type == NodeTypeFnProto); - Buf *proto_name = &proto_node->data.fn_proto.name; + Buf *proto_name = proto_node->data.fn_proto.name; bool is_private = (proto_node->data.fn_proto.top_level_decl.visib_mod == VisibModPrivate); @@ -7064,7 +7064,7 @@ bool is_node_void_expr(AstNode *node) { { AstNode *type_node = node->data.container_init_expr.type; if (type_node->type == NodeTypeSymbol && - buf_eql_str(&type_node->data.symbol_expr.symbol, "void")) + buf_eql_str(type_node->data.symbol_expr.symbol, "void")) { return true; } diff --git a/src/ast_render.cpp b/src/ast_render.cpp index c1d50ddfff..42edde8a6c 100644 --- a/src/ast_render.cpp +++ b/src/ast_render.cpp @@ -78,6 +78,24 @@ static const char *visib_mod_string(VisibMod mod) { zig_unreachable(); } +static const char 
*return_string(ReturnKind kind) { + switch (kind) { + case ReturnKindUnconditional: return "return"; + case ReturnKindError: return "%return"; + case ReturnKindMaybe: return "?return"; + } + zig_unreachable(); +} + +static const char *defer_string(ReturnKind kind) { + switch (kind) { + case ReturnKindUnconditional: return "defer"; + case ReturnKindError: return "%defer"; + case ReturnKindMaybe: return "?defer"; + } + zig_unreachable(); +} + static const char *extern_string(bool is_extern) { return is_extern ? "extern " : ""; } @@ -243,7 +261,7 @@ static bool is_node_void(AstNode *node) { if (node->type == NodeTypeSymbol) { if (node->data.symbol_expr.override_type_entry) { return node->data.symbol_expr.override_type_entry->id == TypeTableEntryIdVoid; - } else if (buf_eql_str(&node->data.symbol_expr.symbol, "void")) { + } else if (buf_eql_str(node->data.symbol_expr.symbol, "void")) { return true; } } @@ -260,7 +278,12 @@ static bool is_digit(uint8_t c) { } static bool is_printable(uint8_t c) { - return is_alpha_under(c) || is_digit(c) || c == ' '; + static const uint8_t printables[] = + " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.~`!@#$%^&*()_-+=\\{}[];'\"?/<>,"; + for (size_t i = 0; i < array_length(printables); i += 1) { + if (c == printables[i]) return true; + } + return false; } static void string_literal_escape(Buf *source, Buf *dest) { @@ -353,18 +376,18 @@ static void render_node(AstRender *ar, AstNode *node) { const char *extern_str = extern_string(node->data.fn_proto.is_extern); const char *inline_str = inline_string(node->data.fn_proto.is_inline); fprintf(ar->f, "%s%s%sfn ", pub_str, inline_str, extern_str); - print_symbol(ar, &node->data.fn_proto.name); + print_symbol(ar, node->data.fn_proto.name); fprintf(ar->f, "("); int arg_count = node->data.fn_proto.params.length; bool is_var_args = node->data.fn_proto.is_var_args; for (int arg_i = 0; arg_i < arg_count; arg_i += 1) { AstNode *param_decl = node->data.fn_proto.params.at(arg_i); 
assert(param_decl->type == NodeTypeParamDecl); - if (buf_len(&param_decl->data.param_decl.name) > 0) { + if (buf_len(param_decl->data.param_decl.name) > 0) { const char *noalias_str = param_decl->data.param_decl.is_noalias ? "noalias " : ""; const char *inline_str = param_decl->data.param_decl.is_inline ? "inline " : ""; fprintf(ar->f, "%s%s", noalias_str, inline_str); - print_symbol(ar, &param_decl->data.param_decl.name); + print_symbol(ar, param_decl->data.param_decl.name); fprintf(ar->f, ": "); } render_node(ar, param_decl->data.param_decl.type); @@ -417,21 +440,31 @@ static void render_node(AstRender *ar, AstNode *node) { fprintf(ar->f, "}"); break; case NodeTypeDirective: - fprintf(ar->f, "#%s(", buf_ptr(&node->data.directive.name)); + fprintf(ar->f, "#%s(", buf_ptr(node->data.directive.name)); render_node(ar, node->data.directive.expr); fprintf(ar->f, ")\n"); break; case NodeTypeReturnExpr: - zig_panic("TODO"); + { + const char *return_str = return_string(node->data.return_expr.kind); + fprintf(ar->f, "%s ", return_str); + render_node(ar, node->data.return_expr.expr); + break; + } case NodeTypeDefer: - zig_panic("TODO"); + { + const char *defer_str = defer_string(node->data.defer.kind); + fprintf(ar->f, "%s ", defer_str); + render_node(ar, node->data.return_expr.expr); + break; + } case NodeTypeVariableDeclaration: { const char *pub_str = visib_mod_string(node->data.variable_declaration.top_level_decl.visib_mod); const char *extern_str = extern_string(node->data.variable_declaration.is_extern); const char *const_or_var = const_or_var_string(node->data.variable_declaration.is_const); fprintf(ar->f, "%s%s%s ", pub_str, extern_str, const_or_var); - print_symbol(ar, &node->data.variable_declaration.symbol); + print_symbol(ar, node->data.variable_declaration.symbol); if (node->data.variable_declaration.type) { fprintf(ar->f, ": "); @@ -446,7 +479,7 @@ static void render_node(AstRender *ar, AstNode *node) { case NodeTypeTypeDecl: { const char *pub_str =
visib_mod_string(node->data.type_decl.top_level_decl.visib_mod); - const char *var_name = buf_ptr(&node->data.type_decl.symbol); + const char *var_name = buf_ptr(node->data.type_decl.symbol); fprintf(ar->f, "%stype %s = ", pub_str, var_name); render_node(ar, node->data.type_decl.child_type); break; @@ -463,12 +496,15 @@ static void render_node(AstRender *ar, AstNode *node) { case NodeTypeUnwrapErrorExpr: zig_panic("TODO"); case NodeTypeNumberLiteral: - switch (node->data.number_literal.kind) { - case NumLitUInt: - fprintf(ar->f, "%" PRIu64, node->data.number_literal.data.x_uint); + switch (node->data.number_literal.bignum->kind) { + case BigNumKindInt: + { + const char *negative_str = node->data.number_literal.bignum->is_negative ? "-" : ""; + fprintf(ar->f, "%s%llu", negative_str, node->data.number_literal.bignum->data.x_uint); + } break; - case NumLitFloat: - fprintf(ar->f, "%f", node->data.number_literal.data.x_float); + case BigNumKindFloat: + fprintf(ar->f, "%f", node->data.number_literal.bignum->data.x_float); break; } break; @@ -478,7 +514,7 @@ static void render_node(AstRender *ar, AstNode *node) { fprintf(ar->f, "c"); } Buf tmp_buf = BUF_INIT; - string_literal_escape(&node->data.string_literal.buf, &tmp_buf); + string_literal_escape(node->data.string_literal.buf, &tmp_buf); fprintf(ar->f, "\"%s\"", buf_ptr(&tmp_buf)); } break; @@ -498,7 +534,7 @@ static void render_node(AstRender *ar, AstNode *node) { if (override_type) { fprintf(ar->f, "%s", buf_ptr(&override_type->name)); } else { - fprintf(ar->f, "%s", buf_ptr(&node->data.symbol_expr.symbol)); + print_symbol(ar, node->data.symbol_expr.symbol); } } break; @@ -513,10 +549,14 @@ static void render_node(AstRender *ar, AstNode *node) { case NodeTypeFnCallExpr: if (node->data.fn_call_expr.is_builtin) { fprintf(ar->f, "@"); + } else { + fprintf(ar->f, "("); + } + render_node(ar, node->data.fn_call_expr.fn_ref_expr); + if (!node->data.fn_call_expr.is_builtin) { + fprintf(ar->f, ")"); } fprintf(ar->f, "("); - 
render_node(ar, node->data.fn_call_expr.fn_ref_expr); - fprintf(ar->f, ")("); for (int i = 0; i < node->data.fn_call_expr.params.length; i += 1) { AstNode *param = node->data.fn_call_expr.params.at(i); if (i != 0) { @@ -537,7 +577,7 @@ static void render_node(AstRender *ar, AstNode *node) { case NodeTypeFieldAccessExpr: { AstNode *lhs = node->data.field_access_expr.struct_expr; - Buf *rhs = &node->data.field_access_expr.field_name; + Buf *rhs = node->data.field_access_expr.field_name; render_node(ar, lhs); fprintf(ar->f, "."); print_symbol(ar, rhs); @@ -577,7 +617,7 @@ static void render_node(AstRender *ar, AstNode *node) { zig_panic("TODO"); case NodeTypeContainerDecl: { - const char *struct_name = buf_ptr(&node->data.struct_decl.name); + const char *struct_name = buf_ptr(node->data.struct_decl.name); const char *pub_str = visib_mod_string(node->data.struct_decl.top_level_decl.visib_mod); const char *container_str = container_string(node->data.struct_decl.kind); fprintf(ar->f, "%s%s %s {\n", pub_str, container_str, struct_name); @@ -586,7 +626,7 @@ static void render_node(AstRender *ar, AstNode *node) { AstNode *field_node = node->data.struct_decl.fields.at(field_i); assert(field_node->type == NodeTypeStructField); print_indent(ar); - print_symbol(ar, &field_node->data.struct_field.name); + print_symbol(ar, field_node->data.struct_field.name); if (!is_node_void(field_node->data.struct_field.type)) { fprintf(ar->f, ": "); render_node(ar, field_node->data.struct_field.type); diff --git a/src/bignum.cpp b/src/bignum.cpp index 1e90b201b0..7cbb880848 100644 --- a/src/bignum.cpp +++ b/src/bignum.cpp @@ -6,6 +6,7 @@ */ #include "bignum.hpp" +#include "buffer.hpp" #include #include @@ -41,6 +42,10 @@ void bignum_init_signed(BigNum *dest, int64_t x) { } } +void bignum_init_bignum(BigNum *dest, BigNum *src) { + memcpy(dest, src, sizeof(BigNum)); +} + bool bignum_fits_in_bits(BigNum *bn, int bit_count, bool is_signed) { assert(bn->kind == BigNumKindInt); @@ -343,3 +348,15 @@ 
bool bignum_cmp_gte(BigNum *op1, BigNum *op2) { return true; } } + +bool bignum_increment_by_scalar(BigNum *bignum, uint64_t scalar) { + assert(bignum->kind == BigNumKindInt); + assert(!bignum->is_negative); + return __builtin_uaddll_overflow(bignum->data.x_uint, scalar, &bignum->data.x_uint); +} + +bool bignum_multiply_by_scalar(BigNum *bignum, uint64_t scalar) { + assert(bignum->kind == BigNumKindInt); + assert(!bignum->is_negative); + return __builtin_umulll_overflow(bignum->data.x_uint, scalar, &bignum->data.x_uint); +} diff --git a/src/bignum.hpp b/src/bignum.hpp index ac1f75e791..570f6fe44b 100644 --- a/src/bignum.hpp +++ b/src/bignum.hpp @@ -5,7 +5,8 @@ * See http://opensource.org/licenses/MIT */ -#include "buffer.hpp" +#ifndef ZIG_BIGNUM_HPP +#define ZIG_BIGNUM_HPP #include @@ -26,6 +27,7 @@ struct BigNum { void bignum_init_float(BigNum *dest, double x); void bignum_init_unsigned(BigNum *dest, uint64_t x); void bignum_init_signed(BigNum *dest, int64_t x); +void bignum_init_bignum(BigNum *dest, BigNum *src); bool bignum_fits_in_bits(BigNum *bn, int bit_count, bool is_signed); uint64_t bignum_to_twos_complement(BigNum *bn); @@ -57,4 +59,11 @@ bool bignum_cmp_gt(BigNum *op1, BigNum *op2); bool bignum_cmp_lte(BigNum *op1, BigNum *op2); bool bignum_cmp_gte(BigNum *op1, BigNum *op2); +// helper functions +bool bignum_increment_by_scalar(BigNum *bignum, uint64_t scalar); +bool bignum_multiply_by_scalar(BigNum *bignum, uint64_t scalar); + +struct Buf; Buf *bignum_to_buf(BigNum *bn); + +#endif diff --git a/src/codegen.cpp b/src/codegen.cpp index 718e909b06..990b7d135e 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1431,7 +1431,7 @@ static LLVMValueRef gen_field_access_expr(CodeGen *g, AstNode *node, bool is_lva TypeTableEntry *struct_type = get_expr_type(struct_expr); if (struct_type->id == TypeTableEntryIdArray) { - Buf *name = &node->data.field_access_expr.field_name; + Buf *name = node->data.field_access_expr.field_name; assert(buf_eql_str(name, "len")); 
return LLVMConstInt(g->builtin_types.entry_usize->type_ref, struct_type->data.array.len, false); @@ -2726,18 +2726,18 @@ static LLVMValueRef gen_block(CodeGen *g, AstNode *block_node, TypeTableEntry *i } static int find_asm_index(CodeGen *g, AstNode *node, AsmToken *tok) { - const char *ptr = buf_ptr(&node->data.asm_expr.asm_template) + tok->start + 2; + const char *ptr = buf_ptr(node->data.asm_expr.asm_template) + tok->start + 2; int len = tok->end - tok->start - 2; int result = 0; for (int i = 0; i < node->data.asm_expr.output_list.length; i += 1, result += 1) { AsmOutput *asm_output = node->data.asm_expr.output_list.at(i); - if (buf_eql_mem(&asm_output->asm_symbolic_name, ptr, len)) { + if (buf_eql_mem(asm_output->asm_symbolic_name, ptr, len)) { return result; } } for (int i = 0; i < node->data.asm_expr.input_list.length; i += 1, result += 1) { AsmInput *asm_input = node->data.asm_expr.input_list.at(i); - if (buf_eql_mem(&asm_input->asm_symbolic_name, ptr, len)) { + if (buf_eql_mem(asm_input->asm_symbolic_name, ptr, len)) { return result; } } @@ -2749,7 +2749,7 @@ static LLVMValueRef gen_asm_expr(CodeGen *g, AstNode *node) { AstNodeAsmExpr *asm_expr = &node->data.asm_expr; - Buf *src_template = &asm_expr->asm_template; + Buf *src_template = asm_expr->asm_template; Buf llvm_template = BUF_INIT; buf_resize(&llvm_template, 0); @@ -2796,11 +2796,11 @@ static LLVMValueRef gen_asm_expr(CodeGen *g, AstNode *node) { for (int i = 0; i < asm_expr->output_list.length; i += 1, total_index += 1) { AsmOutput *asm_output = asm_expr->output_list.at(i); bool is_return = (asm_output->return_type != nullptr); - assert(*buf_ptr(&asm_output->constraint) == '='); + assert(*buf_ptr(asm_output->constraint) == '='); if (is_return) { - buf_appendf(&constraint_buf, "=%s", buf_ptr(&asm_output->constraint) + 1); + buf_appendf(&constraint_buf, "=%s", buf_ptr(asm_output->constraint) + 1); } else { - buf_appendf(&constraint_buf, "=*%s", buf_ptr(&asm_output->constraint) + 1); + 
buf_appendf(&constraint_buf, "=*%s", buf_ptr(asm_output->constraint) + 1); } if (total_index + 1 < total_constraint_count) { buf_append_char(&constraint_buf, ','); @@ -2816,7 +2816,7 @@ static LLVMValueRef gen_asm_expr(CodeGen *g, AstNode *node) { } for (int i = 0; i < asm_expr->input_list.length; i += 1, total_index += 1, param_index += 1) { AsmInput *asm_input = asm_expr->input_list.at(i); - buf_append_buf(&constraint_buf, &asm_input->constraint); + buf_append_buf(&constraint_buf, asm_input->constraint); if (total_index + 1 < total_constraint_count) { buf_append_char(&constraint_buf, ','); } @@ -2885,7 +2885,7 @@ static LLVMValueRef gen_container_init_expr(CodeGen *g, AstNode *node) { if (type_struct_field->type_entry->id == TypeTableEntryIdVoid) { continue; } - assert(buf_eql_buf(type_struct_field->name, &field_node->data.struct_val_field.name)); + assert(buf_eql_buf(type_struct_field->name, field_node->data.struct_val_field.name)); set_debug_source_node(g, field_node); LLVMValueRef field_ptr = LLVMBuildStructGEP(g->builder, tmp_struct_ptr, type_struct_field->gen_index, ""); @@ -3853,7 +3853,7 @@ static void generate_error_name_table(CodeGen *g) { for (int i = 1; i < g->error_decls.length; i += 1) { AstNode *error_decl_node = g->error_decls.at(i); assert(error_decl_node->type == NodeTypeErrorValueDecl); - Buf *name = &error_decl_node->data.error_value_decl.name; + Buf *name = error_decl_node->data.error_value_decl.name; LLVMValueRef str_init = LLVMConstString(buf_ptr(name), buf_len(name), true); LLVMValueRef str_global = LLVMAddGlobal(g->module, LLVMTypeOf(str_init), ""); @@ -3882,7 +3882,7 @@ static void build_label_blocks(CodeGen *g, FnTableEntry *fn) { LLVMBasicBlockRef entry_block = LLVMAppendBasicBlock(fn->fn_value, "entry"); for (int i = 0; i < fn->all_labels.length; i += 1) { LabelTableEntry *label = fn->all_labels.at(i); - Buf *name = &label->decl_node->data.label.name; + Buf *name = label->decl_node->data.label.name; label->basic_block = 
LLVMAppendBasicBlock(fn->fn_value, buf_ptr(name)); } LLVMPositionBuilderAtEnd(g->builder, entry_block); @@ -4951,7 +4951,7 @@ void codegen_generate_h_file(CodeGen *g) { buf_appendf(&h_buf, "%s %s %s(", buf_ptr(export_macro), buf_ptr(&return_type_c), - buf_ptr(&fn_proto->name)); + buf_ptr(fn_proto->name)); Buf param_type_c = BUF_INIT; if (fn_proto->params.length) { @@ -4961,7 +4961,7 @@ void codegen_generate_h_file(CodeGen *g) { to_c_type(g, param_type, &param_type_c); buf_appendf(&h_buf, "%s %s", buf_ptr(&param_type_c), - buf_ptr(&param_decl_node->data.param_decl.name)); + buf_ptr(param_decl_node->data.param_decl.name)); if (param_i < fn_proto->params.length - 1) buf_appendf(&h_buf, ", "); } diff --git a/src/eval.cpp b/src/eval.cpp index a8cecc47b1..b235040b87 100644 --- a/src/eval.cpp +++ b/src/eval.cpp @@ -427,7 +427,7 @@ static EvalVar *find_var(EvalFn *ef, Buf *name) { static bool eval_symbol_expr(EvalFn *ef, AstNode *node, ConstExprValue *out_val) { assert(node->type == NodeTypeSymbol); - Buf *name = &node->data.symbol_expr.symbol; + Buf *name = node->data.symbol_expr.symbol; EvalVar *var = find_var(ef, name); assert(var); @@ -924,7 +924,7 @@ static bool eval_field_access_expr(EvalFn *ef, AstNode *node, ConstExprValue *ou TypeTableEntry *struct_type = get_resolved_expr(struct_expr)->type_entry; if (struct_type->id == TypeTableEntryIdArray) { - Buf *name = &node->data.field_access_expr.field_name; + Buf *name = node->data.field_access_expr.field_name; assert(buf_eql_str(name, "len")); zig_panic("TODO"); } else if (struct_type->id == TypeTableEntryIdStruct || (struct_type->id == TypeTableEntryIdPointer && @@ -971,7 +971,7 @@ static bool eval_for_expr(EvalFn *ef, AstNode *node, ConstExprValue *out_val) { if (eval_expr(ef, array_node, &array_val)) return true; assert(elem_node->type == NodeTypeSymbol); - Buf *elem_var_name = &elem_node->data.symbol_expr.symbol; + Buf *elem_var_name = elem_node->data.symbol_expr.symbol; if (node->data.for_expr.elem_is_ptr) { zig_panic("TODO");
@@ -980,7 +980,7 @@ static bool eval_for_expr(EvalFn *ef, AstNode *node, ConstExprValue *out_val) { Buf *index_var_name = nullptr; if (index_node) { assert(index_node->type == NodeTypeSymbol); - index_var_name = &index_node->data.symbol_expr.symbol; + index_var_name = index_node->data.symbol_expr.symbol; } uint64_t it_index = 0; @@ -1164,7 +1164,7 @@ static bool eval_var_decl_expr(EvalFn *ef, AstNode *node, ConstExprValue *out_va my_scope->vars.add_one(); EvalVar *var = &my_scope->vars.last(); - var->name = &node->data.variable_declaration.symbol; + var->name = node->data.variable_declaration.symbol; if (eval_expr(ef, node->data.variable_declaration.expr, &var->value)) return true; @@ -1178,13 +1178,7 @@ static bool eval_number_literal_expr(EvalFn *ef, AstNode *node, ConstExprValue * assert(!node->data.number_literal.overflow); out_val->ok = true; - if (node->data.number_literal.kind == NumLitUInt) { - bignum_init_unsigned(&out_val->data.x_bignum, node->data.number_literal.data.x_uint); - } else if (node->data.number_literal.kind == NumLitFloat) { - bignum_init_float(&out_val->data.x_bignum, node->data.number_literal.data.x_float); - } else { - zig_unreachable(); - } + bignum_init_bignum(&out_val->data.x_bignum, node->data.number_literal.bignum); return false; } @@ -1339,7 +1333,7 @@ static bool eval_fn_args(EvalFnRoot *efr, FnTableEntry *fn, ConstExprValue *args root_scope->vars.add_one(); EvalVar *eval_var = &root_scope->vars.last(); - eval_var->name = &decl_param_node->data.param_decl.name; + eval_var->name = decl_param_node->data.param_decl.name; eval_var->value = *src_const_val; } diff --git a/src/parseh.cpp b/src/parseh.cpp index 812807684b..ae91b143aa 100644 --- a/src/parseh.cpp +++ b/src/parseh.cpp @@ -104,14 +104,14 @@ static AstNode *create_node(Context *c, NodeType type) { static AstNode *create_symbol_node(Context *c, const char *type_name) { AstNode *node = create_node(c, NodeTypeSymbol); - buf_init_from_str(&node->data.symbol_expr.symbol, type_name); 
+ node->data.symbol_expr.symbol = buf_create_from_str(type_name); return node; } static AstNode *create_field_access_node(Context *c, const char *lhs, const char *rhs) { AstNode *node = create_node(c, NodeTypeFieldAccessExpr); node->data.field_access_expr.struct_expr = create_symbol_node(c, lhs); - buf_init_from_str(&node->data.field_access_expr.field_name, rhs); + node->data.field_access_expr.field_name = buf_create_from_str(rhs); normalize_parent_ptrs(node); return node; } @@ -120,7 +120,7 @@ static AstNode *create_typed_var_decl_node(Context *c, bool is_const, const char AstNode *type_node, AstNode *init_node) { AstNode *node = create_node(c, NodeTypeVariableDeclaration); - buf_init_from_str(&node->data.variable_declaration.symbol, var_name); + node->data.variable_declaration.symbol = buf_create_from_str(var_name); node->data.variable_declaration.is_const = is_const; node->data.variable_declaration.top_level_decl.visib_mod = c->visib_mod; node->data.variable_declaration.expr = init_node; @@ -146,7 +146,7 @@ static AstNode *create_prefix_node(Context *c, PrefixOp op, AstNode *child_node) static AstNode *create_struct_field_node(Context *c, const char *name, AstNode *type_node) { assert(type_node); AstNode *node = create_node(c, NodeTypeStructField); - buf_init_from_str(&node->data.struct_field.name, name); + node->data.struct_field.name = buf_create_from_str(name); node->data.struct_field.top_level_decl.visib_mod = VisibModPub; node->data.struct_field.type = type_node; @@ -157,7 +157,7 @@ static AstNode *create_struct_field_node(Context *c, const char *name, AstNode * static AstNode *create_param_decl_node(Context *c, const char *name, AstNode *type_node, bool is_noalias) { assert(type_node); AstNode *node = create_node(c, NodeTypeParamDecl); - buf_init_from_str(&node->data.param_decl.name, name); + node->data.param_decl.name = buf_create_from_str(name); node->data.param_decl.type = type_node; node->data.param_decl.is_noalias = is_noalias; @@ -171,17 +171,18 @@ 
static AstNode *create_char_lit_node(Context *c, uint8_t value) { return node; } +// accepts ownership of buf static AstNode *create_str_lit_node(Context *c, Buf *buf) { AstNode *node = create_node(c, NodeTypeStringLiteral); - buf_init_from_buf(&node->data.string_literal.buf, buf); + node->data.string_literal.buf = buf; node->data.string_literal.c = true; return node; } static AstNode *create_num_lit_float(Context *c, double x) { AstNode *node = create_node(c, NodeTypeNumberLiteral); - node->data.number_literal.kind = NumLitFloat; - node->data.number_literal.data.x_float = x; + node->data.number_literal.bignum = allocate_nonzero(1); + bignum_init_float(node->data.number_literal.bignum, x); return node; } @@ -193,8 +194,8 @@ static AstNode *create_num_lit_float_negative(Context *c, double x, bool negativ static AstNode *create_num_lit_unsigned(Context *c, uint64_t x) { AstNode *node = create_node(c, NodeTypeNumberLiteral); - node->data.number_literal.kind = NumLitUInt; - node->data.number_literal.data.x_uint = x; + node->data.number_literal.bignum = allocate_nonzero(1); + bignum_init_unsigned(node->data.number_literal.bignum, x); return node; } @@ -221,7 +222,7 @@ static AstNode *create_num_lit_signed(Context *c, int64_t x) { static AstNode *create_type_decl_node(Context *c, const char *name, AstNode *child_type_node) { AstNode *node = create_node(c, NodeTypeTypeDecl); - buf_init_from_str(&node->data.type_decl.symbol, name); + node->data.type_decl.symbol = buf_create_from_str(name); node->data.type_decl.top_level_decl.visib_mod = c->visib_mod; node->data.type_decl.child_type = child_type_node; @@ -240,7 +241,7 @@ static AstNode *create_fn_proto_node(Context *c, Buf *name, TypeTableEntry *fn_t AstNode *node = create_node(c, NodeTypeFnProto); node->data.fn_proto.is_inline = true; node->data.fn_proto.top_level_decl.visib_mod = c->visib_mod; - buf_init_from_buf(&node->data.fn_proto.name, name); + node->data.fn_proto.name = name; node->data.fn_proto.return_type = 
make_type_node(c, fn_type->data.fn.fn_type_id.return_type); for (int i = 0; i < fn_type->data.fn.fn_type_id.param_count; i += 1) { @@ -273,7 +274,7 @@ static AstNode *create_inline_fn_node(Context *c, Buf *fn_name, Buf *var_name, T fn_call_node->data.fn_call_expr.fn_ref_expr = unwrap_node; for (int i = 0; i < fn_type->data.fn.fn_type_id.param_count; i += 1) { AstNode *decl_node = node->data.fn_def.fn_proto->data.fn_proto.params.at(i); - Buf *param_name = &decl_node->data.param_decl.name; + Buf *param_name = decl_node->data.param_decl.name; fn_call_node->data.fn_call_expr.params.append(create_symbol_node(c, buf_ptr(param_name))); } @@ -686,10 +687,9 @@ static TypeTableEntry *resolve_qual_type(Context *c, QualType qt, const Decl *de } static void visit_fn_decl(Context *c, const FunctionDecl *fn_decl) { - Buf fn_name = BUF_INIT; - buf_init_from_str(&fn_name, decl_name(fn_decl)); + Buf *fn_name = buf_create_from_str(decl_name(fn_decl)); - if (c->fn_table.maybe_get(&fn_name)) { + if (c->fn_table.maybe_get(fn_name)) { // we already saw this function return; } @@ -697,14 +697,14 @@ static void visit_fn_decl(Context *c, const FunctionDecl *fn_decl) { TypeTableEntry *fn_type = resolve_qual_type(c, fn_decl->getType(), fn_decl); if (fn_type->id == TypeTableEntryIdInvalid) { - emit_warning(c, fn_decl, "ignoring function '%s' - unable to resolve type", buf_ptr(&fn_name)); + emit_warning(c, fn_decl, "ignoring function '%s' - unable to resolve type", buf_ptr(fn_name)); return; } assert(fn_type->id == TypeTableEntryIdFn); AstNode *node = create_node(c, NodeTypeFnProto); - buf_init_from_buf(&node->data.fn_proto.name, &fn_name); + node->data.fn_proto.name = fn_name; node->data.fn_proto.is_extern = fn_type->data.fn.fn_type_id.is_extern; node->data.fn_proto.top_level_decl.visib_mod = c->visib_mod; @@ -731,7 +731,7 @@ static void visit_fn_decl(Context *c, const FunctionDecl *fn_decl) { normalize_parent_ptrs(node); - c->fn_table.put(buf_create_from_buf(&fn_name), true); + 
c->fn_table.put(buf_create_from_buf(fn_name), true); c->root->data.root.top_level_decls.append(node); } @@ -937,7 +937,7 @@ static void visit_enum_decl(Context *c, const EnumDecl *enum_decl) { if (enum_type->data.enumeration.complete) { // now create top level decl for the type AstNode *enum_node = create_node(c, NodeTypeContainerDecl); - buf_init_from_buf(&enum_node->data.struct_decl.name, &enum_type->name); + enum_node->data.struct_decl.name = &enum_type->name; enum_node->data.struct_decl.kind = ContainerKindEnum; enum_node->data.struct_decl.top_level_decl.visib_mod = VisibModExport; enum_node->data.struct_decl.type_entry = enum_type; @@ -1114,7 +1114,7 @@ static void visit_record_decl(Context *c, const RecordDecl *record_decl) { if (struct_type->data.structure.complete) { // now create a top level decl node for the type AstNode *struct_node = create_node(c, NodeTypeContainerDecl); - buf_init_from_buf(&struct_node->data.struct_decl.name, &struct_type->name); + struct_node->data.struct_decl.name = &struct_type->name; struct_node->data.struct_decl.kind = ContainerKindStruct; struct_node->data.struct_decl.top_level_decl.visib_mod = VisibModExport; struct_node->data.struct_decl.type_entry = struct_type; @@ -1284,7 +1284,7 @@ static void render_aliases(Context *c) { for (int i = 0; i < c->aliases.length; i += 1) { AstNode *alias_node = c->aliases.at(i); assert(alias_node->type == NodeTypeVariableDeclaration); - Buf *name = &alias_node->data.variable_declaration.symbol; + Buf *name = alias_node->data.variable_declaration.symbol; if (name_exists(c, name)) { continue; } @@ -1327,7 +1327,7 @@ static void process_macro(Context *c, CTokenize *ctok, Buf *name, const char *ch case CTokIdStrLit: if (is_last && is_first) { AstNode *var_node = create_var_decl_node(c, buf_ptr(name), - create_str_lit_node(c, &tok->data.str_lit)); + create_str_lit_node(c, buf_create_from_buf(&tok->data.str_lit))); c->macro_table.put(name, var_node); } return; diff --git a/src/parser.cpp 
b/src/parser.cpp index fa77bd6dc3..4939fa94b0 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -21,6 +21,9 @@ struct ParseContext { ImportTableEntry *owner; ErrColor err_color; uint32_t *next_node_index; + // These buffers are used frequently so we preallocate them once here. + Buf *void_buf; + Buf *empty_buf; }; __attribute__ ((format (printf, 4, 5))) @@ -29,7 +32,9 @@ static void ast_asm_error(ParseContext *pc, AstNode *node, int offset, const cha assert(node->type == NodeTypeAsmExpr); - SrcPos pos = node->data.asm_expr.offset_map.at(offset); + // TODO calculate or otherwise keep track of originating line/column number for strings + //SrcPos pos = node->data.asm_expr.offset_map.at(offset); + SrcPos pos = { node->line, node->column }; va_list ap; va_start(ap, format); @@ -83,12 +88,12 @@ static AstNode *ast_create_node(ParseContext *pc, NodeType type, Token *first_to static AstNode *ast_create_void_type_node(ParseContext *pc, Token *token) { AstNode *node = ast_create_node(pc, NodeTypeSymbol, token); - buf_init_from_str(&node->data.symbol_expr.symbol, "void"); + node->data.symbol_expr.symbol = pc->void_buf; return node; } static void parse_asm_template(ParseContext *pc, AstNode *node) { - Buf *asm_template = &node->data.asm_expr.asm_template; + Buf *asm_template = node->data.asm_expr.asm_template; enum State { StateStart, @@ -170,514 +175,29 @@ static void parse_asm_template(ParseContext *pc, AstNode *node) { } } -static uint8_t parse_char_literal(ParseContext *pc, Token *token) { - // skip the single quotes at beginning and end - // convert escape sequences - bool escape = false; - int return_count = 0; - uint8_t return_value; - for (int i = token->start_pos + 1; i < token->end_pos - 1; i += 1) { - uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i); - if (escape) { - switch (c) { - case '\\': - return_value = '\\'; - return_count += 1; - break; - case 'r': - return_value = '\r'; - return_count += 1; - break; - case 'n': - return_value = '\n'; - return_count += 1; -
break; - case 't': - return_value = '\t'; - return_count += 1; - break; - case '\'': - return_value = '\''; - return_count += 1; - break; - default: - ast_error(pc, token, "invalid escape character"); - } - escape = false; - } else if (c == '\\') { - escape = true; - } else { - return_value = c; - return_count += 1; - } - } - if (return_count == 0) { - ast_error(pc, token, "character literal too short"); - } else if (return_count > 1) { - ast_error(pc, token, "character literal too long"); - } - return return_value; +static Buf *token_buf(Token *token) { + assert(token->id == TokenIdStringLiteral || token->id == TokenIdSymbol); + return &token->data.str_lit.str; } -static uint32_t get_hex_digit(uint8_t c) { - switch (c) { - case '0': return 0; - case '1': return 1; - case '2': return 2; - case '3': return 3; - case '4': return 4; - case '5': return 5; - case '6': return 6; - case '7': return 7; - case '8': return 8; - case '9': return 9; - - case 'a': - case 'A': - return 10; - case 'b': - case 'B': - return 11; - case 'c': - case 'C': - return 12; - case 'd': - case 'D': - return 13; - case 'e': - case 'E': - return 14; - case 'f': - case 'F': - return 15; - default: - return UINT32_MAX; - } +static BigNum *token_bignum(Token *token) { + assert(token->id == TokenIdNumberLiteral); + return &token->data.num_lit.bignum; } -static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str, - ZigList *offset_map) -{ - if (token->raw_string_start > 0) { - uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos); - uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1); - assert(c1 == 'r'); - if (out_c_str) { - *out_c_str = (c2 == 'c'); - } - const char *str = buf_ptr(pc->buf) + token->raw_string_start; - buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start); - if (offset_map) { - SrcPos pos = {token->start_line, token->start_column}; - for (int i = token->start_pos; i < token->raw_string_start; i += 1) { - 
uint8_t c = buf_ptr(pc->buf)[i]; - if (c == '\n') { - pos.line += 1; - pos.column = 0; - } else { - pos.column += 1; - } - } - for (int i = token->raw_string_start; i < token->raw_string_end; i += 1) { - offset_map->append(pos); - - uint8_t c = buf_ptr(pc->buf)[i]; - if (c == '\n') { - pos.line += 1; - pos.column = 0; - } else { - pos.column += 1; - } - } - } - return; - } - - // skip the double quotes at beginning and end - // convert escape sequences - // detect c string literal - - enum State { - StatePre, - StateSkipQuot, - StateStart, - StateEscape, - StateHex1, - StateHex2, - StateUnicode, - }; - - buf_resize(buf, 0); - - int unicode_index; - int unicode_end; - - State state = StatePre; - SrcPos pos = {token->start_line, token->start_column}; - uint32_t hex_value = 0; - for (int i = token->start_pos; i < token->end_pos - 1; i += 1) { - uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i); - - switch (state) { - case StatePre: - switch (c) { - case '@': - state = StateSkipQuot; - break; - case 'c': - if (out_c_str) { - *out_c_str = true; - } else { - ast_error(pc, token, "C string literal not allowed here"); - } - state = StateSkipQuot; - break; - case '"': - state = StateStart; - break; - default: - ast_error(pc, token, "invalid string character"); - } - break; - case StateSkipQuot: - state = StateStart; - break; - case StateStart: - if (c == '\\') { - state = StateEscape; - } else { - buf_append_char(buf, c); - if (offset_map) offset_map->append(pos); - } - break; - case StateEscape: - switch (c) { - case '\\': - buf_append_char(buf, '\\'); - if (offset_map) offset_map->append(pos); - state = StateStart; - break; - case 'r': - buf_append_char(buf, '\r'); - if (offset_map) offset_map->append(pos); - state = StateStart; - break; - case 'n': - buf_append_char(buf, '\n'); - if (offset_map) offset_map->append(pos); - state = StateStart; - break; - case 't': - buf_append_char(buf, '\t'); - if (offset_map) offset_map->append(pos); - state = StateStart; - break; - case 
'"': - buf_append_char(buf, '"'); - if (offset_map) offset_map->append(pos); - state = StateStart; - break; - case '\'': - buf_append_char(buf, '\''); - if (offset_map) offset_map->append(pos); - state = StateStart; - break; - case 'x': - state = StateHex1; - break; - case 'u': - state = StateUnicode; - unicode_index = 0; - unicode_end = 4; - hex_value = 0; - break; - case 'U': - state = StateUnicode; - unicode_index = 0; - unicode_end = 6; - hex_value = 0; - break; - default: - ast_error(pc, token, "invalid escape character"); - } - break; - case StateHex1: - { - uint32_t hex_digit = get_hex_digit(c); - if (hex_digit == UINT32_MAX) { - ast_error(pc, token, "invalid hex digit: '%c'", c); - } - hex_value = hex_digit * 16; - state = StateHex2; - break; - } - case StateHex2: - { - uint32_t hex_digit = get_hex_digit(c); - if (hex_digit == UINT32_MAX) { - ast_error(pc, token, "invalid hex digit: '%c'", c); - } - hex_value += hex_digit; - assert(hex_value >= 0 && hex_value <= 255); - buf_append_char(buf, hex_value); - state = StateStart; - break; - } - case StateUnicode: - { - uint32_t hex_digit = get_hex_digit(c); - if (hex_digit == UINT32_MAX) { - ast_error(pc, token, "invalid hex digit: '%c'", c); - } - hex_value *= 16; - hex_value += hex_digit; - unicode_index += 1; - if (unicode_index >= unicode_end) { - if (hex_value <= 0x7f) { - // 00000000 00000000 00000000 0xxxxxxx - buf_append_char(buf, hex_value); - } else if (hex_value <= 0x7ff) { - // 00000000 00000000 00000xxx xx000000 - buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6))); - // 00000000 00000000 00000000 00xxxxxx - buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); - } else if (hex_value <= 0xffff) { - // 00000000 00000000 xxxx0000 00000000 - buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12))); - // 00000000 00000000 0000xxxx xx000000 - buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f))); - // 00000000 00000000 00000000 00xxxxxx - 
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); - } else if (hex_value <= 0x10ffff) { - // 00000000 000xxx00 00000000 00000000 - buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18))); - // 00000000 000000xx xxxx0000 00000000 - buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f))); - // 00000000 00000000 0000xxxx xx000000 - buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f))); - // 00000000 00000000 00000000 00xxxxxx - buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); - } else { - ast_error(pc, token, "unicode value out of range: %x", hex_value); - } - state = StateStart; - } - break; - } - } - if (c == '\n') { - pos.line += 1; - pos.column = 0; - } else { - pos.column += 1; - } - } - assert(state == StateStart); - if (offset_map) offset_map->append(pos); +static uint8_t token_char_lit(Token *token) { + assert(token->id == TokenIdCharLiteral); + return token->data.char_lit.c; } static void ast_buf_from_token(ParseContext *pc, Token *token, Buf *buf) { - uint8_t *first_char = (uint8_t *)buf_ptr(pc->buf) + token->start_pos; - bool at_sign = *first_char == '@'; - if (at_sign) { - parse_string_literal(pc, token, buf, nullptr, nullptr); + if (token->id == TokenIdSymbol) { + buf_init_from_buf(buf, token_buf(token)); } else { buf_init_from_mem(buf, buf_ptr(pc->buf) + token->start_pos, token->end_pos - token->start_pos); } } - -static unsigned long long parse_int_digits(ParseContext *pc, int digits_start, int digits_end, int radix, - int skip_index, bool *overflow) -{ - unsigned long long x = 0; - - for (int i = digits_start; i < digits_end; i++) { - if (i == skip_index) - continue; - uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i); - unsigned long long digit = get_digit_value(c); - - // x *= radix; - if (__builtin_umulll_overflow(x, radix, &x)) { - *overflow = true; - return 0; - } - - // x += digit - if (__builtin_uaddll_overflow(x, digit, &x)) { - *overflow = true; - return 0; - } - } - 
return x; -} - -static void parse_number_literal(ParseContext *pc, Token *token, AstNodeNumberLiteral *num_lit) { - assert(token->id == TokenIdNumberLiteral); - - int whole_number_start = token->start_pos; - if (token->radix != 10) { - // skip the "0x" - whole_number_start += 2; - } - - int whole_number_end = token->decimal_point_pos; - if (whole_number_end <= whole_number_start) { - // TODO: error for empty whole number part - num_lit->overflow = true; - return; - } - - if (token->decimal_point_pos == token->end_pos) { - // integer - unsigned long long whole_number = parse_int_digits(pc, whole_number_start, whole_number_end, - token->radix, -1, &num_lit->overflow); - if (num_lit->overflow) return; - - num_lit->data.x_uint = whole_number; - num_lit->kind = NumLitUInt; - } else { - // float - - if (token->radix == 10) { - // use a third-party base-10 float parser - char *str_begin = buf_ptr(pc->buf) + whole_number_start; - char *str_end; - errno = 0; - double x = strtod(str_begin, &str_end); - if (errno) { - // TODO: forward error to user - num_lit->overflow = true; - return; - } - assert(str_end == buf_ptr(pc->buf) + token->end_pos); - num_lit->data.x_float = x; - num_lit->kind = NumLitFloat; - return; - } - - if (token->decimal_point_pos < token->exponent_marker_pos) { - // fraction - int fraction_start = token->decimal_point_pos + 1; - int fraction_end = token->exponent_marker_pos; - if (fraction_end <= fraction_start) { - // TODO: error for empty fraction part - num_lit->overflow = true; - return; - } - } - - // trim leading and trailing zeros in the significand digit sequence - int significand_start = whole_number_start; - for (; significand_start < token->exponent_marker_pos; significand_start++) { - if (significand_start == token->decimal_point_pos) - continue; - uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + significand_start); - if (c != '0') - break; - } - int significand_end = token->exponent_marker_pos; - for (; significand_end - 1 > significand_start; 
significand_end--) { - if (significand_end - 1 <= token->decimal_point_pos) { - significand_end = token->decimal_point_pos; - break; - } - uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + significand_end - 1); - if (c != '0') - break; - } - - unsigned long long significand_as_int = parse_int_digits(pc, significand_start, significand_end, - token->radix, token->decimal_point_pos, &num_lit->overflow); - if (num_lit->overflow) return; - - int exponent_in_bin_or_dec = 0; - if (significand_end > token->decimal_point_pos) { - exponent_in_bin_or_dec = token->decimal_point_pos + 1 - significand_end; - if (token->radix == 2) { - // already good - } else if (token->radix == 8) { - exponent_in_bin_or_dec *= 3; - } else if (token->radix == 10) { - // already good - } else if (token->radix == 16) { - exponent_in_bin_or_dec *= 4; - } else zig_unreachable(); - } - - if (token->exponent_marker_pos < token->end_pos) { - // exponent - int exponent_start = token->exponent_marker_pos + 1; - int exponent_end = token->end_pos; - if (exponent_end <= exponent_start) { - // TODO: error for empty exponent part - num_lit->overflow = true; - return; - } - bool is_exponent_negative = false; - uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + exponent_start); - if (c == '+') { - exponent_start += 1; - } else if (c == '-') { - exponent_start += 1; - is_exponent_negative = true; - } - - if (exponent_end <= exponent_start) { - // TODO: error for empty exponent part - num_lit->overflow = true; - return; - } - - unsigned long long specified_exponent = parse_int_digits(pc, exponent_start, exponent_end, - 10, -1, &num_lit->overflow); - // TODO: this check is a little silly - if (specified_exponent >= LLONG_MAX) { - num_lit->overflow = true; - return; - } - - if (is_exponent_negative) { - exponent_in_bin_or_dec -= specified_exponent; - } else { - exponent_in_bin_or_dec += specified_exponent; - } - } - - uint64_t significand_bits; - uint64_t exponent_bits; - if (significand_as_int != 0) { - // normalize the 
significand - if (token->radix == 10) { - zig_panic("TODO: decimal floats"); - } else { - int significand_magnitude_in_bin = __builtin_clzll(1) - __builtin_clzll(significand_as_int); - exponent_in_bin_or_dec += significand_magnitude_in_bin; - if (!(-1023 <= exponent_in_bin_or_dec && exponent_in_bin_or_dec < 1023)) { - num_lit->overflow = true; - return; - } - - // this should chop off exactly one 1 bit from the top. - significand_bits = ((uint64_t)significand_as_int << (52 - significand_magnitude_in_bin)) & 0xfffffffffffffULL; - exponent_bits = exponent_in_bin_or_dec + 1023; - } - } else { - // 0 is all 0's - significand_bits = 0; - exponent_bits = 0; - } - - uint64_t double_bits = (exponent_bits << 52) | significand_bits; - double x = *(double *)&double_bits; - - num_lit->data.x_float = x; - num_lit->kind = NumLitFloat; - } -} - - __attribute__ ((noreturn)) static void ast_invalid_token_error(ParseContext *pc, Token *token) { Buf token_value = BUF_INIT; @@ -723,7 +243,7 @@ static AstNode *ast_parse_directive(ParseContext *pc, int *token_index) { Token *name_symbol = ast_eat_token(pc, token_index, TokenIdSymbol); - ast_buf_from_token(pc, name_symbol, &node->data.directive.name); + node->data.directive.name = token_buf(name_symbol); node->data.directive.expr = ast_parse_grouped_expr(pc, token_index, true); @@ -769,12 +289,12 @@ static AstNode *ast_parse_param_decl(ParseContext *pc, int *token_index) { token = &pc->tokens->at(*token_index); } - buf_resize(&node->data.param_decl.name, 0); + node->data.param_decl.name = pc->empty_buf; if (token->id == TokenIdSymbol) { Token *next_token = &pc->tokens->at(*token_index + 1); if (next_token->id == TokenIdColon) { - ast_buf_from_token(pc, token, &node->data.param_decl.name); + node->data.param_decl.name = token_buf(token); *token_index += 2; } } @@ -915,8 +435,8 @@ static void ast_parse_asm_input_item(ParseContext *pc, int *token_index, AstNode ast_eat_token(pc, token_index, TokenIdRParen); AsmInput *asm_input = 
allocate(1); - ast_buf_from_token(pc, alias, &asm_input->asm_symbolic_name); - parse_string_literal(pc, constraint, &asm_input->constraint, nullptr, nullptr); + asm_input->asm_symbolic_name = token_buf(alias); + asm_input->constraint = token_buf(constraint); asm_input->expr = expr_node; node->data.asm_expr.input_list.append(asm_input); } @@ -938,7 +458,7 @@ static void ast_parse_asm_output_item(ParseContext *pc, int *token_index, AstNod Token *token = &pc->tokens->at(*token_index); *token_index += 1; if (token->id == TokenIdSymbol) { - ast_buf_from_token(pc, token, &asm_output->variable_name); + asm_output->variable_name = token_buf(token); } else if (token->id == TokenIdArrow) { asm_output->return_type = ast_parse_prefix_op_expr(pc, token_index, true); } else { @@ -947,8 +467,8 @@ static void ast_parse_asm_output_item(ParseContext *pc, int *token_index, AstNod ast_eat_token(pc, token_index, TokenIdRParen); - ast_buf_from_token(pc, alias, &asm_output->asm_symbolic_name); - parse_string_literal(pc, constraint, &asm_output->constraint, nullptr, nullptr); + asm_output->asm_symbolic_name = token_buf(alias); + asm_output->constraint = token_buf(constraint); node->data.asm_expr.output_list.append(asm_output); } @@ -968,8 +488,7 @@ static void ast_parse_asm_clobbers(ParseContext *pc, int *token_index, AstNode * ast_expect_token(pc, string_tok, TokenIdStringLiteral); *token_index += 1; - Buf *clobber_buf = buf_alloc(); - parse_string_literal(pc, string_tok, clobber_buf, nullptr, nullptr); + Buf *clobber_buf = token_buf(string_tok); node->data.asm_expr.clobber_list.append(clobber_buf); Token *comma = &pc->tokens->at(*token_index); @@ -1072,19 +591,14 @@ static AstNode *ast_parse_asm_expr(ParseContext *pc, int *token_index, bool mand ast_expect_token(pc, lparen_tok, TokenIdLParen); *token_index += 1; - Token *template_tok = &pc->tokens->at(*token_index); - ast_expect_token(pc, template_tok, TokenIdStringLiteral); - *token_index += 1; + Token *template_tok = ast_eat_token(pc, 
token_index, TokenIdStringLiteral); - parse_string_literal(pc, template_tok, &node->data.asm_expr.asm_template, nullptr, - &node->data.asm_expr.offset_map); + node->data.asm_expr.asm_template = token_buf(template_tok); parse_asm_template(pc, node); ast_parse_asm_output(pc, token_index, node); - Token *rparen_tok = &pc->tokens->at(*token_index); - ast_expect_token(pc, rparen_tok, TokenIdRParen); - *token_index += 1; + ast_eat_token(pc, token_index, TokenIdRParen); normalize_parent_ptrs(node); return node; @@ -1099,17 +613,19 @@ static AstNode *ast_parse_primary_expr(ParseContext *pc, int *token_index, bool if (token->id == TokenIdNumberLiteral) { AstNode *node = ast_create_node(pc, NodeTypeNumberLiteral, token); - parse_number_literal(pc, token, &node->data.number_literal); + node->data.number_literal.bignum = token_bignum(token); + node->data.number_literal.overflow = token->data.num_lit.overflow; *token_index += 1; return node; } else if (token->id == TokenIdStringLiteral) { AstNode *node = ast_create_node(pc, NodeTypeStringLiteral, token); - parse_string_literal(pc, token, &node->data.string_literal.buf, &node->data.string_literal.c, nullptr); + node->data.string_literal.buf = token_buf(token); + node->data.string_literal.c = token->data.str_lit.is_c_str; *token_index += 1; return node; } else if (token->id == TokenIdCharLiteral) { AstNode *node = ast_create_node(pc, NodeTypeCharLiteral, token); - node->data.char_literal.value = parse_char_literal(pc, token); + node->data.char_literal.value = token_char_lit(token); *token_index += 1; return node; } else if (token->id == TokenIdKeywordTrue) { @@ -1155,7 +671,7 @@ static AstNode *ast_parse_primary_expr(ParseContext *pc, int *token_index, bool *token_index += 1; Token *name_tok = ast_eat_token(pc, token_index, TokenIdSymbol); AstNode *name_node = ast_create_node(pc, NodeTypeSymbol, name_tok); - ast_buf_from_token(pc, name_tok, &name_node->data.symbol_expr.symbol); + name_node->data.symbol_expr.symbol = 
token_buf(name_tok); AstNode *node = ast_create_node(pc, NodeTypeFnCallExpr, token); node->data.fn_call_expr.fn_ref_expr = name_node; @@ -1168,7 +684,7 @@ static AstNode *ast_parse_primary_expr(ParseContext *pc, int *token_index, bool } else if (token->id == TokenIdSymbol) { *token_index += 1; AstNode *node = ast_create_node(pc, NodeTypeSymbol, token); - ast_buf_from_token(pc, token, &node->data.symbol_expr.symbol); + node->data.symbol_expr.symbol = token_buf(token); return node; } else if (token->id == TokenIdKeywordGoto) { AstNode *node = ast_create_node(pc, NodeTypeGoto, token); @@ -1178,7 +694,7 @@ static AstNode *ast_parse_primary_expr(ParseContext *pc, int *token_index, bool *token_index += 1; ast_expect_token(pc, dest_symbol, TokenIdSymbol); - ast_buf_from_token(pc, dest_symbol, &node->data.goto_expr.name); + node->data.goto_expr.name = token_buf(dest_symbol); return node; } @@ -1243,7 +759,7 @@ static AstNode *ast_parse_curly_suffix_expr(ParseContext *pc, int *token_index, AstNode *field_node = ast_create_node(pc, NodeTypeStructValueField, token); - ast_buf_from_token(pc, field_name_tok, &field_node->data.struct_val_field.name); + field_node->data.struct_val_field.name = token_buf(field_name_tok); field_node->data.struct_val_field.expr = ast_parse_expression(pc, token_index, true); normalize_parent_ptrs(field_node); @@ -1370,7 +886,7 @@ static AstNode *ast_parse_suffix_op_expr(ParseContext *pc, int *token_index, boo AstNode *node = ast_create_node(pc, NodeTypeFieldAccessExpr, first_token); node->data.field_access_expr.struct_expr = primary_expr; - ast_buf_from_token(pc, name_token, &node->data.field_access_expr.field_name); + node->data.field_access_expr.field_name = token_buf(name_token); normalize_parent_ptrs(node); primary_expr = node; @@ -1819,10 +1335,10 @@ static AstNode *ast_parse_if_expr(ParseContext *pc, int *token_index, bool manda *token_index += 1; node->data.if_var_expr.var_is_ptr = true; Token *name_token = ast_eat_token(pc, token_index, 
TokenIdSymbol); - ast_buf_from_token(pc, name_token, &node->data.if_var_expr.var_decl.symbol); + node->data.if_var_expr.var_decl.symbol = token_buf(name_token); } else if (star_or_symbol->id == TokenIdSymbol) { *token_index += 1; - ast_buf_from_token(pc, star_or_symbol, &node->data.if_var_expr.var_decl.symbol); + node->data.if_var_expr.var_decl.symbol = token_buf(star_or_symbol); } else { ast_invalid_token_error(pc, star_or_symbol); } @@ -1974,7 +1490,7 @@ static AstNode *ast_parse_variable_declaration_expr(ParseContext *pc, int *token node->data.variable_declaration.top_level_decl.directives = directives; Token *name_token = ast_eat_token(pc, token_index, TokenIdSymbol); - ast_buf_from_token(pc, name_token, &node->data.variable_declaration.symbol); + node->data.variable_declaration.symbol = token_buf(name_token); Token *eq_or_colon = &pc->tokens->at(*token_index); *token_index += 1; @@ -2067,7 +1583,7 @@ static AstNode *ast_parse_while_expr(ParseContext *pc, int *token_index, bool ma static AstNode *ast_parse_symbol(ParseContext *pc, int *token_index) { Token *token = ast_eat_token(pc, token_index, TokenIdSymbol); AstNode *node = ast_create_node(pc, NodeTypeSymbol, token); - ast_buf_from_token(pc, token, &node->data.symbol_expr.symbol); + node->data.symbol_expr.symbol = token_buf(token); return node; } @@ -2405,7 +1921,7 @@ static AstNode *ast_parse_label(ParseContext *pc, int *token_index, bool mandato *token_index += 2; AstNode *node = ast_create_node(pc, NodeTypeLabel, symbol_token); - ast_buf_from_token(pc, symbol_token, &node->data.label.name); + node->data.label.name = token_buf(symbol_token); return node; } @@ -2413,7 +1929,7 @@ static AstNode *ast_create_void_expr(ParseContext *pc, Token *token) { AstNode *node = ast_create_node(pc, NodeTypeContainerInitExpr, token); node->data.container_init_expr.type = ast_create_node(pc, NodeTypeSymbol, token); node->data.container_init_expr.kind = ContainerInitKindArray; - 
buf_init_from_str(&node->data.container_init_expr.type->data.symbol_expr.symbol, "void"); + node->data.container_init_expr.type->data.symbol_expr.symbol = pc->void_buf; normalize_parent_ptrs(node); return node; } @@ -2508,9 +2024,9 @@ static AstNode *ast_parse_fn_proto(ParseContext *pc, int *token_index, bool mand Token *fn_name = &pc->tokens->at(*token_index); if (fn_name->id == TokenIdSymbol) { *token_index += 1; - ast_buf_from_token(pc, fn_name, &node->data.fn_proto.name); + node->data.fn_proto.name = token_buf(fn_name); } else { - buf_resize(&node->data.fn_proto.name, 0); + node->data.fn_proto.name = pc->empty_buf; } ast_parse_param_decl_list(pc, token_index, &node->data.fn_proto.params, &node->data.fn_proto.is_var_args); @@ -2663,7 +2179,7 @@ static AstNode *ast_parse_container_decl(ParseContext *pc, int *token_index, AstNode *node = ast_create_node(pc, NodeTypeContainerDecl, first_token); node->data.struct_decl.kind = kind; - ast_buf_from_token(pc, struct_name, &node->data.struct_decl.name); + node->data.struct_decl.name = token_buf(struct_name); node->data.struct_decl.top_level_decl.visib_mod = visib_mod; node->data.struct_decl.top_level_decl.directives = directives; @@ -2729,8 +2245,7 @@ static AstNode *ast_parse_container_decl(ParseContext *pc, int *token_index, field_node->data.struct_field.top_level_decl.visib_mod = visib_mod; field_node->data.struct_field.top_level_decl.directives = directive_list; - - ast_buf_from_token(pc, token, &field_node->data.struct_field.name); + field_node->data.struct_field.name = token_buf(token); Token *expr_or_comma = &pc->tokens->at(*token_index); if (expr_or_comma->id == TokenIdComma) { @@ -2772,7 +2287,7 @@ static AstNode *ast_parse_error_value_decl(ParseContext *pc, int *token_index, AstNode *node = ast_create_node(pc, NodeTypeErrorValueDecl, first_token); node->data.error_value_decl.top_level_decl.visib_mod = visib_mod; node->data.error_value_decl.top_level_decl.directives = directives; - ast_buf_from_token(pc, 
name_tok, &node->data.error_value_decl.name); + node->data.error_value_decl.name = token_buf(name_tok); normalize_parent_ptrs(node); return node; @@ -2795,7 +2310,7 @@ static AstNode *ast_parse_type_decl(ParseContext *pc, int *token_index, ast_eat_token(pc, token_index, TokenIdEq); AstNode *node = ast_create_node(pc, NodeTypeTypeDecl, first_token); - ast_buf_from_token(pc, name_tok, &node->data.type_decl.symbol); + node->data.type_decl.symbol = token_buf(name_tok); node->data.type_decl.child_type = ast_parse_prefix_op_expr(pc, token_index, true); ast_eat_token(pc, token_index, TokenIdSemicolon); @@ -2901,6 +2416,8 @@ AstNode *ast_parse(Buf *buf, ZigList *tokens, ImportTableEntry *owner, ErrColor err_color, uint32_t *next_node_index) { ParseContext pc = {0}; + pc.void_buf = buf_create_from_str("void"); + pc.empty_buf = buf_create_from_str(""); pc.err_color = err_color; pc.owner = owner; pc.buf = buf; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 52a815f950..33abf35062 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #define WHITESPACE \ ' ': \ @@ -30,7 +33,7 @@ '0': \ case DIGIT_NON_ZERO -#define ALPHA_EXCEPT_CR \ +#define ALPHA_EXCEPT_C \ 'a': \ case 'b': \ /*case 'c':*/ \ @@ -48,7 +51,7 @@ case 'o': \ case 'p': \ case 'q': \ - /*case 'r':*/ \ + case 'r': \ case 's': \ case 't': \ case 'u': \ @@ -85,77 +88,93 @@ case 'Z' #define ALPHA \ - ALPHA_EXCEPT_CR: \ - case 'c': \ - case 'r' - -#define SYMBOL_CHAR \ - SYMBOL_CHAR_EXCEPT_C: \ + ALPHA_EXCEPT_C: \ case 'c' -#define SYMBOL_CHAR_EXCEPT_C \ - ALPHA_EXCEPT_CR: \ - case 'r': \ +#define SYMBOL_CHAR \ + ALPHA_EXCEPT_C: \ case DIGIT: \ - case '_' + case '_': \ + case 'c' #define SYMBOL_START \ ALPHA: \ case '_' -#define HEX_DIGIT \ - 'a': \ - case 'b': \ - case 'c': \ - case 'd': \ - case 'e': \ - case 'f': \ - case 'A': \ - case 'B': \ - case 'C': \ - case 'D': \ - case 'E': \ - case 'F': \ - case DIGIT +struct ZigKeyword { + 
const char *text; + TokenId token_id; +}; -const char * zig_keywords[] = { - "true", "false", "null", "fn", "return", "var", "const", "extern", - "pub", "export", "use", "if", "else", "goto", "asm", - "volatile", "struct", "enum", "while", "for", "continue", "break", - "null", "noalias", "switch", "undefined", "error", "type", "inline", - "defer", "union", +static const struct ZigKeyword zig_keywords[] = { + {"asm", TokenIdKeywordAsm}, + {"break", TokenIdKeywordBreak}, + {"const", TokenIdKeywordConst}, + {"continue", TokenIdKeywordContinue}, + {"defer", TokenIdKeywordDefer}, + {"else", TokenIdKeywordElse}, + {"enum", TokenIdKeywordEnum}, + {"error", TokenIdKeywordError}, + {"export", TokenIdKeywordExport}, + {"extern", TokenIdKeywordExtern}, + {"false", TokenIdKeywordFalse}, + {"fn", TokenIdKeywordFn}, + {"for", TokenIdKeywordFor}, + {"goto", TokenIdKeywordGoto}, + {"if", TokenIdKeywordIf}, + {"inline", TokenIdKeywordInline}, + {"noalias", TokenIdKeywordNoAlias}, + {"null", TokenIdKeywordNull}, + {"pub", TokenIdKeywordPub}, + {"return", TokenIdKeywordReturn}, + {"struct", TokenIdKeywordStruct}, + {"switch", TokenIdKeywordSwitch}, + {"true", TokenIdKeywordTrue}, + {"type", TokenIdKeywordType}, + {"undefined", TokenIdKeywordUndefined}, + {"union", TokenIdKeywordUnion}, + {"use", TokenIdKeywordUse}, + {"var", TokenIdKeywordVar}, + {"volatile", TokenIdKeywordVolatile}, + {"while", TokenIdKeywordWhile}, }; bool is_zig_keyword(Buf *buf) { for (int i = 0; i < array_length(zig_keywords); i += 1) { - if (buf_eql_str(buf, zig_keywords[i])) { + if (buf_eql_str(buf, zig_keywords[i].text)) { return true; } } return false; } +static bool is_symbol_char(uint8_t c) { + switch (c) { + case SYMBOL_CHAR: + return true; + default: + return false; + } +} + enum TokenizeState { TokenizeStateStart, TokenizeStateSymbol, - TokenizeStateSymbolFirst, - TokenizeStateSymbolFirstRaw, - TokenizeStateFirstR, + TokenizeStateSymbolFirstC, TokenizeStateZero, // "0", which might lead to "0x" 
TokenizeStateNumber, // "123", "0x123" + TokenizeStateNumberDot, TokenizeStateFloatFraction, // "123.456", "0x123.456" TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p" TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5" TokenizeStateString, TokenizeStateStringEscape, - TokenizeStateRawString, - TokenizeStateRawStringContents, - TokenizeStateRawStringMaybeEnd, TokenizeStateCharLiteral, TokenizeStateCharLiteralEnd, TokenizeStateSawStar, TokenizeStateSawStarPercent, TokenizeStateSawSlash, + TokenizeStateSawBackslash, TokenizeStateSawPercent, TokenizeStateSawPlus, TokenizeStateSawPlusPercent, @@ -167,6 +186,9 @@ enum TokenizeState { TokenizeStateSawPipe, TokenizeStateSawPipePipe, TokenizeStateLineComment, + TokenizeStateLineString, + TokenizeStateLineStringEnd, + TokenizeStateLineStringContinue, TokenizeStateSawEq, TokenizeStateSawBang, TokenizeStateSawLessThan, @@ -178,7 +200,7 @@ enum TokenizeState { TokenizeStateSawDotDot, TokenizeStateSawQuestionMark, TokenizeStateSawAtSign, - TokenizeStateHex, + TokenizeStateCharCode, TokenizeStateError, }; @@ -192,10 +214,16 @@ struct Tokenize { int column; Token *cur_tok; Tokenization *out; - int raw_string_id_start; - int raw_string_id_end; - int raw_string_id_cmp_pos; - int hex_chars_left; + uint32_t radix; + int32_t exp_add_amt; + bool is_exp_negative; + bool is_num_lit_float; + size_t char_code_index; + size_t char_code_end; + bool unicode; + uint32_t char_code; + int exponent_in_bin_or_dec; + BigNum specified_exponent; }; __attribute__ ((format (printf, 2, 3))) @@ -216,19 +244,28 @@ static void tokenize_error(Tokenize *t, const char *format, ...) 
{ va_end(ap); } +static void set_token_id(Tokenize *t, Token *token, TokenId id) { + token->id = id; + + if (id == TokenIdNumberLiteral) { + token->data.num_lit.overflow = false; + } else if (id == TokenIdStringLiteral || id == TokenIdSymbol) { + memset(&token->data.str_lit.str, 0, sizeof(Buf)); + buf_resize(&token->data.str_lit.str, 0); + token->data.str_lit.is_c_str = false; + } +} + static void begin_token(Tokenize *t, TokenId id) { assert(!t->cur_tok); t->tokens->add_one(); Token *token = &t->tokens->last(); token->start_line = t->line; token->start_column = t->column; - token->id = id; token->start_pos = t->pos; - token->radix = 0; - token->decimal_point_pos = 0; - token->exponent_marker_pos = 0; - token->raw_string_start = 0; - token->raw_string_end = 0; + + set_token_id(t, token, id); + t->cur_tok = token; } @@ -237,83 +274,82 @@ static void cancel_token(Tokenize *t) { t->cur_tok = nullptr; } +static void end_float_token(Tokenize *t) { + t->cur_tok->data.num_lit.bignum.kind = BigNumKindFloat; + + if (t->radix == 10) { + char *str_begin = buf_ptr(t->buf) + t->cur_tok->start_pos; + char *str_end; + errno = 0; + t->cur_tok->data.num_lit.bignum.data.x_float = strtod(str_begin, &str_end); + if (errno) { + t->cur_tok->data.num_lit.overflow = true; + return; + } + assert(str_end == buf_ptr(t->buf) + t->cur_tok->end_pos); + return; + } + + + if (t->specified_exponent.data.x_uint >= INT_MAX) { + t->cur_tok->data.num_lit.overflow = true; + return; + } + + int64_t specified_exponent = t->specified_exponent.data.x_uint; + if (t->is_exp_negative) { + specified_exponent = -specified_exponent; + } + t->exponent_in_bin_or_dec += specified_exponent; + + uint64_t significand = t->cur_tok->data.num_lit.bignum.data.x_uint; + uint64_t significand_bits; + uint64_t exponent_bits; + if (significand == 0) { + // 0 is all 0's + significand_bits = 0; + exponent_bits = 0; + } else { + // normalize the significand + if (t->radix == 10) { + zig_panic("TODO: decimal floats"); + } else { + 
int significand_magnitude_in_bin = __builtin_clzll(1) - __builtin_clzll(significand); + t->exponent_in_bin_or_dec += significand_magnitude_in_bin; + if (!(-1023 <= t->exponent_in_bin_or_dec && t->exponent_in_bin_or_dec < 1023)) { + t->cur_tok->data.num_lit.overflow = true; + } else { + // this should chop off exactly one 1 bit from the top. + significand_bits = ((uint64_t)significand << (52 - significand_magnitude_in_bin)) & 0xfffffffffffffULL; + exponent_bits = t->exponent_in_bin_or_dec + 1023; + } + } + } + uint64_t double_bits = (exponent_bits << 52) | significand_bits; + memcpy(&t->cur_tok->data.num_lit.bignum.data.x_float, &double_bits, sizeof(double)); +} + static void end_token(Tokenize *t) { assert(t->cur_tok); t->cur_tok->end_pos = t->pos + 1; - // normalize number literal parsing stuff if (t->cur_tok->id == TokenIdNumberLiteral) { - if (t->cur_tok->exponent_marker_pos == 0) { - t->cur_tok->exponent_marker_pos = t->cur_tok->end_pos; + if (t->cur_tok->data.num_lit.overflow) { + return; } - if (t->cur_tok->decimal_point_pos == 0) { - t->cur_tok->decimal_point_pos = t->cur_tok->exponent_marker_pos; + if (t->is_num_lit_float) { + end_float_token(t); } - } + } else if (t->cur_tok->id == TokenIdSymbol) { + char *token_mem = buf_ptr(t->buf) + t->cur_tok->start_pos; + int token_len = t->cur_tok->end_pos - t->cur_tok->start_pos; - char *token_mem = buf_ptr(t->buf) + t->cur_tok->start_pos; - int token_len = t->cur_tok->end_pos - t->cur_tok->start_pos; - - if (mem_eql_str(token_mem, token_len, "fn")) { - t->cur_tok->id = TokenIdKeywordFn; - } else if (mem_eql_str(token_mem, token_len, "return")) { - t->cur_tok->id = TokenIdKeywordReturn; - } else if (mem_eql_str(token_mem, token_len, "var")) { - t->cur_tok->id = TokenIdKeywordVar; - } else if (mem_eql_str(token_mem, token_len, "const")) { - t->cur_tok->id = TokenIdKeywordConst; - } else if (mem_eql_str(token_mem, token_len, "extern")) { - t->cur_tok->id = TokenIdKeywordExtern; - } else if (mem_eql_str(token_mem, 
token_len, "pub")) { - t->cur_tok->id = TokenIdKeywordPub; - } else if (mem_eql_str(token_mem, token_len, "export")) { - t->cur_tok->id = TokenIdKeywordExport; - } else if (mem_eql_str(token_mem, token_len, "use")) { - t->cur_tok->id = TokenIdKeywordUse; - } else if (mem_eql_str(token_mem, token_len, "true")) { - t->cur_tok->id = TokenIdKeywordTrue; - } else if (mem_eql_str(token_mem, token_len, "false")) { - t->cur_tok->id = TokenIdKeywordFalse; - } else if (mem_eql_str(token_mem, token_len, "if")) { - t->cur_tok->id = TokenIdKeywordIf; - } else if (mem_eql_str(token_mem, token_len, "else")) { - t->cur_tok->id = TokenIdKeywordElse; - } else if (mem_eql_str(token_mem, token_len, "goto")) { - t->cur_tok->id = TokenIdKeywordGoto; - } else if (mem_eql_str(token_mem, token_len, "volatile")) { - t->cur_tok->id = TokenIdKeywordVolatile; - } else if (mem_eql_str(token_mem, token_len, "asm")) { - t->cur_tok->id = TokenIdKeywordAsm; - } else if (mem_eql_str(token_mem, token_len, "struct")) { - t->cur_tok->id = TokenIdKeywordStruct; - } else if (mem_eql_str(token_mem, token_len, "enum")) { - t->cur_tok->id = TokenIdKeywordEnum; - } else if (mem_eql_str(token_mem, token_len, "union")) { - t->cur_tok->id = TokenIdKeywordUnion; - } else if (mem_eql_str(token_mem, token_len, "for")) { - t->cur_tok->id = TokenIdKeywordFor; - } else if (mem_eql_str(token_mem, token_len, "while")) { - t->cur_tok->id = TokenIdKeywordWhile; - } else if (mem_eql_str(token_mem, token_len, "continue")) { - t->cur_tok->id = TokenIdKeywordContinue; - } else if (mem_eql_str(token_mem, token_len, "break")) { - t->cur_tok->id = TokenIdKeywordBreak; - } else if (mem_eql_str(token_mem, token_len, "null")) { - t->cur_tok->id = TokenIdKeywordNull; - } else if (mem_eql_str(token_mem, token_len, "noalias")) { - t->cur_tok->id = TokenIdKeywordNoAlias; - } else if (mem_eql_str(token_mem, token_len, "switch")) { - t->cur_tok->id = TokenIdKeywordSwitch; - } else if (mem_eql_str(token_mem, token_len, "undefined")) { - 
t->cur_tok->id = TokenIdKeywordUndefined; - } else if (mem_eql_str(token_mem, token_len, "error")) { - t->cur_tok->id = TokenIdKeywordError; - } else if (mem_eql_str(token_mem, token_len, "type")) { - t->cur_tok->id = TokenIdKeywordType; - } else if (mem_eql_str(token_mem, token_len, "inline")) { - t->cur_tok->id = TokenIdKeywordInline; - } else if (mem_eql_str(token_mem, token_len, "defer")) { - t->cur_tok->id = TokenIdKeywordDefer; + for (size_t i = 0; i < array_length(zig_keywords); i += 1) { + if (mem_eql_str(token_mem, token_len, zig_keywords[i].text)) { + t->cur_tok->id = zig_keywords[i].token_id; + break; + } + } } t->cur_tok = nullptr; @@ -327,7 +363,7 @@ static bool is_exponent_signifier(uint8_t c, int radix) { } } -int get_digit_value(uint8_t c) { +static uint32_t get_digit_value(uint8_t c) { if ('0' <= c && c <= '9') { return c - '0'; } @@ -337,7 +373,19 @@ int get_digit_value(uint8_t c) { if ('a' <= c && c <= 'z') { return c - 'a' + 10; } - return -1; + return UINT32_MAX; +} + +void handle_string_escape(Tokenize *t, uint8_t c) { + if (t->cur_tok->id == TokenIdCharLiteral) { + t->cur_tok->data.char_lit.c = c; + t->state = TokenizeStateCharLiteralEnd; + } else if (t->cur_tok->id == TokenIdStringLiteral || t->cur_tok->id == TokenIdSymbol) { + buf_append_char(&t->cur_tok->data.str_lit.str, c); + t->state = TokenizeStateString; + } else { + zig_unreachable(); + } } void tokenize(Buf *buf, Tokenization *out) { @@ -359,27 +407,35 @@ void tokenize(Buf *buf, Tokenization *out) { case WHITESPACE: break; case 'c': - t.state = TokenizeStateSymbolFirst; + t.state = TokenizeStateSymbolFirstC; begin_token(&t, TokenIdSymbol); + buf_append_char(&t.cur_tok->data.str_lit.str, c); break; - case 'r': - t.state = TokenizeStateFirstR; - begin_token(&t, TokenIdSymbol); - break; - case ALPHA_EXCEPT_CR: + case ALPHA_EXCEPT_C: case '_': t.state = TokenizeStateSymbol; begin_token(&t, TokenIdSymbol); + buf_append_char(&t.cur_tok->data.str_lit.str, c); break; case '0': t.state = 
TokenizeStateZero; begin_token(&t, TokenIdNumberLiteral); - t.cur_tok->radix = 10; + t.radix = 10; + t.exp_add_amt = 1; + t.exponent_in_bin_or_dec = 0; + t.is_num_lit_float = false; + bignum_init_unsigned(&t.cur_tok->data.num_lit.bignum, 0); + bignum_init_unsigned(&t.specified_exponent, 0); break; case DIGIT_NON_ZERO: t.state = TokenizeStateNumber; begin_token(&t, TokenIdNumberLiteral); - t.cur_tok->radix = 10; + t.radix = 10; + t.exp_add_amt = 1; + t.exponent_in_bin_or_dec = 0; + t.is_num_lit_float = false; + bignum_init_unsigned(&t.cur_tok->data.num_lit.bignum, get_digit_value(c)); + bignum_init_unsigned(&t.specified_exponent, 0); break; case '"': begin_token(&t, TokenIdStringLiteral); @@ -437,6 +493,10 @@ void tokenize(Buf *buf, Tokenization *out) { begin_token(&t, TokenIdSlash); t.state = TokenizeStateSawSlash; break; + case '\\': + begin_token(&t, TokenIdStringLiteral); + t.state = TokenizeStateSawBackslash; + break; case '%': begin_token(&t, TokenIdPercent); t.state = TokenizeStateSawPercent; @@ -500,12 +560,12 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawQuestionMark: switch (c) { case '?': - t.cur_tok->id = TokenIdDoubleQuestion; + set_token_id(&t, t.cur_tok, TokenIdDoubleQuestion); end_token(&t); t.state = TokenizeStateStart; break; case '=': - t.cur_tok->id = TokenIdMaybeAssign; + set_token_id(&t, t.cur_tok, TokenIdMaybeAssign); end_token(&t); t.state = TokenizeStateStart; break; @@ -520,7 +580,7 @@ void tokenize(Buf *buf, Tokenization *out) { switch (c) { case '.': t.state = TokenizeStateSawDotDot; - t.cur_tok->id = TokenIdEllipsis; + set_token_id(&t, t.cur_tok, TokenIdEllipsis); break; default: t.pos -= 1; @@ -542,12 +602,12 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawGreaterThan: switch (c) { case '=': - t.cur_tok->id = TokenIdCmpGreaterOrEq; + set_token_id(&t, t.cur_tok, TokenIdCmpGreaterOrEq); end_token(&t); t.state = TokenizeStateStart; break; case '>': - t.cur_tok->id = TokenIdBitShiftRight; + 
set_token_id(&t, t.cur_tok, TokenIdBitShiftRight); t.state = TokenizeStateSawGreaterThanGreaterThan; break; default: @@ -560,7 +620,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawGreaterThanGreaterThan: switch (c) { case '=': - t.cur_tok->id = TokenIdBitShiftRightEq; + set_token_id(&t, t.cur_tok, TokenIdBitShiftRightEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -574,12 +634,12 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawLessThan: switch (c) { case '=': - t.cur_tok->id = TokenIdCmpLessOrEq; + set_token_id(&t, t.cur_tok, TokenIdCmpLessOrEq); end_token(&t); t.state = TokenizeStateStart; break; case '<': - t.cur_tok->id = TokenIdBitShiftLeft; + set_token_id(&t, t.cur_tok, TokenIdBitShiftLeft); t.state = TokenizeStateSawLessThanLessThan; break; default: @@ -592,12 +652,12 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawLessThanLessThan: switch (c) { case '=': - t.cur_tok->id = TokenIdBitShiftLeftEq; + set_token_id(&t, t.cur_tok, TokenIdBitShiftLeftEq); end_token(&t); t.state = TokenizeStateStart; break; case '%': - t.cur_tok->id = TokenIdBitShiftLeftPercent; + set_token_id(&t, t.cur_tok, TokenIdBitShiftLeftPercent); t.state = TokenizeStateSawShiftLeftPercent; break; default: @@ -610,7 +670,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawShiftLeftPercent: switch (c) { case '=': - t.cur_tok->id = TokenIdBitShiftLeftPercentEq; + set_token_id(&t, t.cur_tok, TokenIdBitShiftLeftPercentEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -624,7 +684,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawBang: switch (c) { case '=': - t.cur_tok->id = TokenIdCmpNotEq; + set_token_id(&t, t.cur_tok, TokenIdCmpNotEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -638,12 +698,12 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawEq: switch (c) { case '=': - t.cur_tok->id = TokenIdCmpEq; + set_token_id(&t, t.cur_tok, 
TokenIdCmpEq); end_token(&t); t.state = TokenizeStateStart; break; case '>': - t.cur_tok->id = TokenIdFatArrow; + set_token_id(&t, t.cur_tok, TokenIdFatArrow); end_token(&t); t.state = TokenizeStateStart; break; @@ -657,17 +717,17 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawStar: switch (c) { case '=': - t.cur_tok->id = TokenIdTimesEq; + set_token_id(&t, t.cur_tok, TokenIdTimesEq); end_token(&t); t.state = TokenizeStateStart; break; case '*': - t.cur_tok->id = TokenIdStarStar; + set_token_id(&t, t.cur_tok, TokenIdStarStar); end_token(&t); t.state = TokenizeStateStart; break; case '%': - t.cur_tok->id = TokenIdTimesPercent; + set_token_id(&t, t.cur_tok, TokenIdTimesPercent); t.state = TokenizeStateSawStarPercent; break; default: @@ -680,7 +740,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawStarPercent: switch (c) { case '=': - t.cur_tok->id = TokenIdTimesPercentEq; + set_token_id(&t, t.cur_tok, TokenIdTimesPercentEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -694,17 +754,17 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawPercent: switch (c) { case '=': - t.cur_tok->id = TokenIdModEq; + set_token_id(&t, t.cur_tok, TokenIdModEq); end_token(&t); t.state = TokenizeStateStart; break; case '.': - t.cur_tok->id = TokenIdPercentDot; + set_token_id(&t, t.cur_tok, TokenIdPercentDot); end_token(&t); t.state = TokenizeStateStart; break; case '%': - t.cur_tok->id = TokenIdPercentPercent; + set_token_id(&t, t.cur_tok, TokenIdPercentPercent); end_token(&t); t.state = TokenizeStateStart; break; @@ -718,17 +778,17 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawPlus: switch (c) { case '=': - t.cur_tok->id = TokenIdPlusEq; + set_token_id(&t, t.cur_tok, TokenIdPlusEq); end_token(&t); t.state = TokenizeStateStart; break; case '+': - t.cur_tok->id = TokenIdPlusPlus; + set_token_id(&t, t.cur_tok, TokenIdPlusPlus); end_token(&t); t.state = TokenizeStateStart; break; case '%': - 
t.cur_tok->id = TokenIdPlusPercent; + set_token_id(&t, t.cur_tok, TokenIdPlusPercent); t.state = TokenizeStateSawPlusPercent; break; default: @@ -741,7 +801,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawPlusPercent: switch (c) { case '=': - t.cur_tok->id = TokenIdPlusPercentEq; + set_token_id(&t, t.cur_tok, TokenIdPlusPercentEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -755,11 +815,11 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawAmpersand: switch (c) { case '&': - t.cur_tok->id = TokenIdBoolAnd; + set_token_id(&t, t.cur_tok, TokenIdBoolAnd); t.state = TokenizeStateSawAmpersandAmpersand; break; case '=': - t.cur_tok->id = TokenIdBitAndEq; + set_token_id(&t, t.cur_tok, TokenIdBitAndEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -773,7 +833,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawAmpersandAmpersand: switch (c) { case '=': - t.cur_tok->id = TokenIdBoolAndEq; + set_token_id(&t, t.cur_tok, TokenIdBoolAndEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -787,7 +847,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawCaret: switch (c) { case '=': - t.cur_tok->id = TokenIdBitXorEq; + set_token_id(&t, t.cur_tok, TokenIdBitXorEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -801,11 +861,11 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawPipe: switch (c) { case '|': - t.cur_tok->id = TokenIdBoolOr; + set_token_id(&t, t.cur_tok, TokenIdBoolOr); t.state = TokenizeStateSawPipePipe; break; case '=': - t.cur_tok->id = TokenIdBitOrEq; + set_token_id(&t, t.cur_tok, TokenIdBitOrEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -819,7 +879,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawPipePipe: switch (c) { case '=': - t.cur_tok->id = TokenIdBoolOrEq; + set_token_id(&t, t.cur_tok, TokenIdBoolOrEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -837,7 +897,7 @@ void 
tokenize(Buf *buf, Tokenization *out) { t.state = TokenizeStateLineComment; break; case '=': - t.cur_tok->id = TokenIdDivEq; + set_token_id(&t, t.cur_tok, TokenIdDivEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -848,6 +908,51 @@ void tokenize(Buf *buf, Tokenization *out) { continue; } break; + case TokenizeStateSawBackslash: + switch (c) { + case '\\': + t.state = TokenizeStateLineString; + break; + default: + tokenize_error(&t, "invalid character: '%c'", c); + break; + } + break; + case TokenizeStateLineString: + switch (c) { + case '\n': + t.state = TokenizeStateLineStringEnd; + break; + default: + buf_append_char(&t.cur_tok->data.str_lit.str, c); + break; + } + break; + case TokenizeStateLineStringEnd: + switch (c) { + case WHITESPACE: + break; + case '\\': + t.state = TokenizeStateLineStringContinue; + break; + default: + t.pos -= 1; + end_token(&t); + t.state = TokenizeStateStart; + continue; + } + break; + case TokenizeStateLineStringContinue: + switch (c) { + case '\\': + t.state = TokenizeStateLineString; + buf_append_char(&t.cur_tok->data.str_lit.str, '\n'); + break; + default: + tokenize_error(&t, "invalid character: '%c'", c); + break; + } + break; case TokenizeStateLineComment: switch (c) { case '\n': @@ -858,31 +963,16 @@ void tokenize(Buf *buf, Tokenization *out) { break; } break; - case TokenizeStateSymbolFirst: + case TokenizeStateSymbolFirstC: switch (c) { case '"': - t.cur_tok->id = TokenIdStringLiteral; + set_token_id(&t, t.cur_tok, TokenIdStringLiteral); + t.cur_tok->data.str_lit.is_c_str = true; t.state = TokenizeStateString; break; case SYMBOL_CHAR: t.state = TokenizeStateSymbol; - break; - default: - t.pos -= 1; - end_token(&t); - t.state = TokenizeStateStart; - continue; - } - break; - case TokenizeStateSymbolFirstRaw: - switch (c) { - case '"': - t.cur_tok->id = TokenIdStringLiteral; - t.state = TokenizeStateRawString; - t.raw_string_id_start = t.pos + 1; - break; - case SYMBOL_CHAR: - t.state = TokenizeStateSymbol; + 
buf_append_char(&t.cur_tok->data.str_lit.str, c); break; default: t.pos -= 1; @@ -894,7 +984,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawAtSign: switch (c) { case '"': - t.cur_tok->id = TokenIdSymbol; + set_token_id(&t, t.cur_tok, TokenIdSymbol); t.state = TokenizeStateString; break; default: @@ -904,29 +994,10 @@ void tokenize(Buf *buf, Tokenization *out) { continue; } break; - case TokenizeStateFirstR: - switch (c) { - case '"': - t.cur_tok->id = TokenIdStringLiteral; - t.state = TokenizeStateRawString; - t.raw_string_id_start = t.pos + 1; - break; - case 'c': - t.state = TokenizeStateSymbolFirstRaw; - break; - case SYMBOL_CHAR_EXCEPT_C: - t.state = TokenizeStateSymbol; - break; - default: - t.pos -= 1; - end_token(&t); - t.state = TokenizeStateStart; - continue; - } - break; case TokenizeStateSymbol: switch (c) { case SYMBOL_CHAR: + buf_append_char(&t.cur_tok->data.str_lit.str, c); break; default: t.pos -= 1; @@ -942,108 +1013,124 @@ void tokenize(Buf *buf, Tokenization *out) { t.state = TokenizeStateStart; break; case '\n': - tokenize_error(&t, "use raw string for multiline string literal"); + tokenize_error(&t, "newline not allowed in string literal"); break; case '\\': t.state = TokenizeStateStringEscape; break; default: + buf_append_char(&t.cur_tok->data.str_lit.str, c); break; } break; case TokenizeStateStringEscape: switch (c) { case 'x': - t.state = TokenizeStateHex; - t.hex_chars_left = 2; + t.state = TokenizeStateCharCode; + t.radix = 16; + t.char_code = 0; + t.char_code_index = 0; + t.char_code_end = 2; + t.unicode = false; break; case 'u': - t.state = TokenizeStateHex; - t.hex_chars_left = 4; + t.state = TokenizeStateCharCode; + t.radix = 16; + t.char_code = 0; + t.char_code_index = 0; + t.char_code_end = 4; + t.unicode = true; break; case 'U': - t.state = TokenizeStateHex; - t.hex_chars_left = 6; + t.state = TokenizeStateCharCode; + t.radix = 16; + t.char_code = 0; + t.char_code_index = 0; + t.char_code_end = 6; + t.unicode 
= true; break; case 'n': + handle_string_escape(&t, '\n'); + break; case 'r': + handle_string_escape(&t, '\r'); + break; case '\\': + handle_string_escape(&t, '\\'); + break; case 't': + handle_string_escape(&t, '\t'); + break; case '\'': + handle_string_escape(&t, '\''); + break; case '"': - if (t.cur_tok->id == TokenIdCharLiteral) { - t.state = TokenizeStateCharLiteralEnd; - } else if (t.cur_tok->id == TokenIdStringLiteral) { - t.state = TokenizeStateString; - } else { - zig_unreachable(); - } + handle_string_escape(&t, '\"'); break; default: tokenize_error(&t, "invalid character: '%c'", c); } break; - case TokenizeStateHex: - switch (c) { - case HEX_DIGIT: - t.hex_chars_left -= 1; - if (t.hex_chars_left == 0) { - if (t.cur_tok->id == TokenIdCharLiteral) { - t.state = TokenizeStateCharLiteralEnd; - } else if (t.cur_tok->id == TokenIdStringLiteral) { - t.state = TokenizeStateString; - } else if (t.cur_tok->id == TokenIdSymbol) { - t.state = TokenizeStateString; - } else { - zig_unreachable(); - } - } - break; - default: - tokenize_error(&t, "invalid character: '%c'", c); - } - break; - case TokenizeStateRawString: - if (c == '(') { - t.raw_string_id_end = t.pos; - t.cur_tok->raw_string_start = t.pos + 1; - t.state = TokenizeStateRawStringContents; - } - break; - case TokenizeStateRawStringContents: - if (c == ')') { - t.state = TokenizeStateRawStringMaybeEnd; - t.raw_string_id_cmp_pos = t.raw_string_id_start; - t.cur_tok->raw_string_end = t.pos; - } - break; - case TokenizeStateRawStringMaybeEnd: - if (t.raw_string_id_cmp_pos >= t.raw_string_id_end && - c == '"') + case TokenizeStateCharCode: { - end_token(&t); - t.state = TokenizeStateStart; - } else if (c != buf_ptr(t.buf)[t.raw_string_id_cmp_pos]) { - if (c == ')') { - t.raw_string_id_cmp_pos = t.raw_string_id_start; - t.cur_tok->raw_string_end = t.pos; - } else { - t.state = TokenizeStateRawStringContents; + uint32_t digit_value = get_digit_value(c); + if (digit_value >= t.radix) { + tokenize_error(&t, 
"invalid digit: '%c'", c); + } + t.char_code *= t.radix; + t.char_code += digit_value; + t.char_code_index += 1; + + if (t.char_code_index >= t.char_code_end) { + if (t.unicode) { + if (t.char_code <= 0x7f) { + // 00000000 00000000 00000000 0xxxxxxx + handle_string_escape(&t, t.char_code); + } else if (t.cur_tok->id == TokenIdCharLiteral) { + tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code); + } else if (t.char_code <= 0x7ff) { + // 00000000 00000000 00000xxx xx000000 + handle_string_escape(&t, 0xc0 | (t.char_code >> 6)); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, 0x80 | (t.char_code & 0x3f)); + } else if (t.char_code <= 0xffff) { + // 00000000 00000000 xxxx0000 00000000 + handle_string_escape(&t, 0xe0 | (t.char_code >> 12)); + // 00000000 00000000 0000xxxx xx000000 + handle_string_escape(&t, 0x80 | ((t.char_code >> 6) & 0x3f)); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, 0x80 | (t.char_code & 0x3f)); + } else if (t.char_code <= 0x10ffff) { + // 00000000 000xxx00 00000000 00000000 + handle_string_escape(&t, 0xf0 | (t.char_code >> 18)); + // 00000000 000000xx xxxx0000 00000000 + handle_string_escape(&t, 0x80 | ((t.char_code >> 12) & 0x3f)); + // 00000000 00000000 0000xxxx xx000000 + handle_string_escape(&t, 0x80 | ((t.char_code >> 6) & 0x3f)); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, 0x80 | (t.char_code & 0x3f)); + } else { + tokenize_error(&t, "unicode value out of range: %x", t.char_code); + } + } else { + if (t.cur_tok->id == TokenIdCharLiteral && t.char_code >= sizeof(uint8_t)) { + tokenize_error(&t, "value too large for character literal: '%x'", + t.char_code); + } + handle_string_escape(&t, t.char_code); + } } - } else { - t.raw_string_id_cmp_pos += 1; } break; case TokenizeStateCharLiteral: switch (c) { case '\'': - end_token(&t); - t.state = TokenizeStateStart; - break; + tokenize_error(&t, "expected character"); case '\\': t.state = 
TokenizeStateStringEscape; break; default: + t.cur_tok->data.char_lit.c = c; t.state = TokenizeStateCharLiteralEnd; break; } @@ -1061,15 +1148,17 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateZero: switch (c) { case 'b': - t.cur_tok->radix = 2; + t.radix = 2; t.state = TokenizeStateNumber; break; case 'o': - t.cur_tok->radix = 8; + t.radix = 8; + t.exp_add_amt = 3; t.state = TokenizeStateNumber; break; case 'x': - t.cur_tok->radix = 16; + t.radix = 16; + t.exp_add_amt = 4; t.state = TokenizeStateNumber; break; default: @@ -1082,113 +1171,127 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateNumber: { if (c == '.') { - if (t.pos + 1 < buf_len(t.buf)) { - uint8_t next_c = buf_ptr(t.buf)[t.pos + 1]; - if (next_c == '.') { - t.pos -= 1; - end_token(&t); - t.state = TokenizeStateStart; - continue; - } - } - t.cur_tok->decimal_point_pos = t.pos; - t.state = TokenizeStateFloatFraction; + t.state = TokenizeStateNumberDot; break; } - if (is_exponent_signifier(c, t.cur_tok->radix)) { - t.cur_tok->exponent_marker_pos = t.pos; + if (is_exponent_signifier(c, t.radix)) { t.state = TokenizeStateFloatExponentUnsigned; + t.is_num_lit_float = true; break; } - if (c == '_') { - tokenize_error(&t, "invalid character: '%c'", c); - break; - } - int digit_value = get_digit_value(c); - if (digit_value >= 0) { - if (digit_value >= t.cur_tok->radix) { + uint32_t digit_value = get_digit_value(c); + if (digit_value >= t.radix) { + if (is_symbol_char(c)) { tokenize_error(&t, "invalid character: '%c'", c); - break; } - // normal digit - } else { // not my char t.pos -= 1; end_token(&t); t.state = TokenizeStateStart; continue; } + t.cur_tok->data.num_lit.overflow = t.cur_tok->data.num_lit.overflow || + bignum_multiply_by_scalar(&t.cur_tok->data.num_lit.bignum, t.radix); + t.cur_tok->data.num_lit.overflow = t.cur_tok->data.num_lit.overflow || + bignum_increment_by_scalar(&t.cur_tok->data.num_lit.bignum, digit_value); break; } + case TokenizeStateNumberDot: + if 
(c == '.') { + t.pos -= 2; + end_token(&t); + t.state = TokenizeStateStart; + continue; + } + t.pos -= 1; + t.state = TokenizeStateFloatFraction; + t.is_num_lit_float = true; + continue; case TokenizeStateFloatFraction: { - if (is_exponent_signifier(c, t.cur_tok->radix)) { - t.cur_tok->exponent_marker_pos = t.pos; + if (is_exponent_signifier(c, t.radix)) { t.state = TokenizeStateFloatExponentUnsigned; break; } - if (c == '_') { - tokenize_error(&t, "invalid character: '%c'", c); - break; - } - int digit_value = get_digit_value(c); - if (digit_value >= 0) { - if (digit_value >= t.cur_tok->radix) { + uint32_t digit_value = get_digit_value(c); + if (digit_value >= t.radix) { + if (is_symbol_char(c)) { tokenize_error(&t, "invalid character: '%c'", c); - break; } - // normal digit - } else { // not my char t.pos -= 1; end_token(&t); t.state = TokenizeStateStart; continue; } + t.exponent_in_bin_or_dec -= t.exp_add_amt; + if (t.radix == 10) { + // For now we use strtod to parse decimal floats, so we just have to get to the + // end of the token. 
+ break; + } + t.cur_tok->data.num_lit.overflow = t.cur_tok->data.num_lit.overflow || + bignum_multiply_by_scalar(&t.cur_tok->data.num_lit.bignum, t.radix); + t.cur_tok->data.num_lit.overflow = t.cur_tok->data.num_lit.overflow || + bignum_increment_by_scalar(&t.cur_tok->data.num_lit.bignum, digit_value); break; } case TokenizeStateFloatExponentUnsigned: switch (c) { case '+': + t.is_exp_negative = false; + t.state = TokenizeStateFloatExponentNumber; + break; case '-': + t.is_exp_negative = true; t.state = TokenizeStateFloatExponentNumber; break; default: // reinterpret as normal exponent number t.pos -= 1; + t.is_exp_negative = false; t.state = TokenizeStateFloatExponentNumber; continue; } break; case TokenizeStateFloatExponentNumber: - switch (c) { - case DIGIT: - break; - case ALPHA: - case '_': - tokenize_error(&t, "invalid character: '%c'", c); - break; - default: + { + uint32_t digit_value = get_digit_value(c); + if (digit_value >= t.radix) { + if (is_symbol_char(c)) { + tokenize_error(&t, "invalid character: '%c'", c); + } + // not my char t.pos -= 1; end_token(&t); t.state = TokenizeStateStart; continue; + } + if (t.radix == 10) { + // For now we use strtod to parse decimal floats, so we just have to get to the + // end of the token. 
+ break; + } + t.cur_tok->data.num_lit.overflow = t.cur_tok->data.num_lit.overflow || + bignum_multiply_by_scalar(&t.specified_exponent, 10); + t.cur_tok->data.num_lit.overflow = t.cur_tok->data.num_lit.overflow || + bignum_increment_by_scalar(&t.specified_exponent, digit_value); } break; case TokenizeStateSawDash: switch (c) { case '>': - t.cur_tok->id = TokenIdArrow; + set_token_id(&t, t.cur_tok, TokenIdArrow); end_token(&t); t.state = TokenizeStateStart; break; case '=': - t.cur_tok->id = TokenIdMinusEq; + set_token_id(&t, t.cur_tok, TokenIdMinusEq); end_token(&t); t.state = TokenizeStateStart; break; case '%': - t.cur_tok->id = TokenIdMinusPercent; + set_token_id(&t, t.cur_tok, TokenIdMinusPercent); t.state = TokenizeStateSawMinusPercent; break; default: @@ -1201,7 +1304,7 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawMinusPercent: switch (c) { case '=': - t.cur_tok->id = TokenIdMinusPercentEq; + set_token_id(&t, t.cur_tok, TokenIdMinusPercentEq); end_token(&t); t.state = TokenizeStateStart; break; @@ -1226,11 +1329,14 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateStart: case TokenizeStateError: break; + case TokenizeStateNumberDot: + tokenize_error(&t, "unterminated number literal"); + break; case TokenizeStateString: tokenize_error(&t, "unterminated string"); break; case TokenizeStateStringEscape: - case TokenizeStateHex: + case TokenizeStateCharCode: if (t.cur_tok->id == TokenIdStringLiteral) { tokenize_error(&t, "unterminated string"); } else if (t.cur_tok->id == TokenIdCharLiteral) { @@ -1239,19 +1345,12 @@ void tokenize(Buf *buf, Tokenization *out) { zig_unreachable(); } break; - case TokenizeStateRawString: - case TokenizeStateRawStringContents: - case TokenizeStateRawStringMaybeEnd: - tokenize_error(&t, "unterminated raw string"); - break; case TokenizeStateCharLiteral: case TokenizeStateCharLiteralEnd: tokenize_error(&t, "unterminated character literal"); break; case TokenizeStateSymbol: - case 
TokenizeStateSymbolFirst: - case TokenizeStateSymbolFirstRaw: - case TokenizeStateFirstR: + case TokenizeStateSymbolFirstC: case TokenizeStateZero: case TokenizeStateNumber: case TokenizeStateFloatFraction: @@ -1280,9 +1379,13 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateSawPlusPercent: case TokenizeStateSawMinusPercent: case TokenizeStateSawShiftLeftPercent: + case TokenizeStateLineString: + case TokenizeStateLineStringEnd: end_token(&t); break; case TokenizeStateSawDotDot: + case TokenizeStateSawBackslash: + case TokenizeStateLineStringContinue: tokenize_error(&t, "unexpected EOF"); break; case TokenizeStateLineComment: diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 44ed8a14dd..49e88c5cc8 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -9,6 +9,7 @@ #define ZIG_TOKENIZER_HPP #include "buffer.hpp" +#include "bignum.hpp" enum TokenId { TokenIdEof, @@ -111,6 +112,22 @@ enum TokenId { TokenIdPercentDot, }; +struct TokenNumLit { + BigNum bignum; + // overflow is true if when parsing the number, we discovered it would not + // fit without losing data in a uint64_t or double + bool overflow; +}; + +struct TokenStrLit { + Buf str; + bool is_c_str; +}; + +struct TokenCharLit { + uint8_t c; +}; + struct Token { TokenId id; int start_pos; @@ -118,14 +135,16 @@ struct Token { int start_line; int start_column; - // for id == TokenIdNumberLiteral - int radix; // if != 10, then skip the first 2 characters - int decimal_point_pos; // either exponent_marker_pos or the position of the '.' 
- int exponent_marker_pos; // either end_pos or the position of the 'e'/'p' + union { + // TokenIdNumberLiteral + TokenNumLit num_lit; - // for id == TokenIdStringLiteral - int raw_string_start; - int raw_string_end; + // TokenIdStringLiteral or TokenIdSymbol + TokenStrLit str_lit; + + // TokenIdCharLiteral + TokenCharLit char_lit; + } data; }; struct Tokenization { @@ -142,8 +161,6 @@ void tokenize(Buf *buf, Tokenization *out_tokenization); void print_tokens(Buf *buf, ZigList *tokens); -int get_digit_value(uint8_t c); - const char * token_name(TokenId id); bool valid_symbol_starter(uint8_t c); diff --git a/test/run_tests.cpp b/test/run_tests.cpp index d03d8d0d52..7941f40ab1 100644 --- a/test/run_tests.cpp +++ b/test/run_tests.cpp @@ -1173,7 +1173,7 @@ fn f() { add_compile_fail_case("normal string with newline", R"SOURCE( const foo = "a b"; - )SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal"); + )SOURCE", 1, ".tmp_source.zig:2:13: error: newline not allowed in string literal"); add_compile_fail_case("invalid comparison for function pointers", R"SOURCE( fn foo() {} @@ -1760,7 +1760,7 @@ struct type { )SOURCE", 3, R"(pub const FOO = c"aoeu\x13 derp")", R"(pub const FOO2 = c"aoeu\x134 derp")", - R"(pub const FOO_CHAR = '\x3f')"); + R"(pub const FOO_CHAR = '?')"); } static void run_self_hosted_test(bool is_release_mode) { diff --git a/test/self_hosted.zig b/test/self_hosted.zig index bb2eec814a..6c9fd2d5f5 100644 --- a/test/self_hosted.zig +++ b/test/self_hosted.zig @@ -684,17 +684,13 @@ fn count_trailing_zeroes() { #attribute("test") fn multiline_string() { - const s1 = r"AOEU( -one -two) -three)AOEU"; - const s2 = "\none\ntwo)\nthree"; - const s3 = r"( -one -two) -three)"; + const s1 = + \\one + \\two) + \\three + ; + const s2 = "one\ntwo)\nthree"; assert(str.eql(s1, s2)); - assert(str.eql(s3, s2)); }