1 module hjson.parser;
2 
3 public import std.json : JSONValue;
4 
5 import std.algorithm;
6 import std.array;
7 import std.ascii : isDecimalDigit = isDigit, isHexDigit;
8 import std.conv : ConvException, to;
9 import std.exception : _enforce = enforce, basicExceptionCtors;
10 import std.format : format;
11 import std.range;
12 import std.typecons : Flag;
13 import std.uni;
14 
15 import hjson.adapter;
16 
/** Parses a Hjson document into a `std.json.JSONValue`.
    Params:
        hjson = Text of the Hjson document.
    Throws:
        `HjsonException` if the text is not valid Hjson.
    Returns:
        The `JSONValue` filled in while parsing.
*/
JSONValue parseHjson(string hjson)
{
    JSONValue parsed;
    // The serializer is handed a pointer to `parsed`, which is returned below.
    scope serializer = StdJsonSerializer(&parsed);
    parseHjson(hjson, serializer);
    return parsed;
}
32 
/** Parses a Hjson value, reporting every parsed token to `consumer`.
    Use this overload to build representations other than `std.json.JSONValue`.
    Params:
        hjson = Text of the Hjson document.
        consumer = Object responsible for processing the parsed tokens.
    Throws:
        `HjsonException` if the text is not valid Hjson.
*/
void parseHjson(Consumer)(string hjson, ref Consumer consumer)
{
    // Nothing has been consumed yet, so the column counter starts at zero.
    size_t collumn;
    parseValue(hjson, collumn, consumer);
}
46 
/** Exception type used by this module to report invalid Hjson.

    Its constructors are generated by `std.exception.basicExceptionCtors`.
*/
class HjsonException : Exception
{
    mixin basicExceptionCtors;
}
52 
/** Parses a single Hjson value (object, array, string, number, bool or null).
    Params:
        hjson = string being parsed. May contain leading whitespace and comments.
                Everything from the beginning of the string up to the end of the
                parsed Hjson value is consumed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                  the last line feed. Updated by the function.
                  Needed to properly unindent multiline strings.
        consumer = Object responsible for processing the parsed tokens.
    Throws:
        `HjsonException` if `hjson` does not start with a valid Hjson value.
        Invalid Hjson past the first valid value is not detected.
*/
void parseValue(Consumer)(ref string hjson, ref size_t collumn, ref Consumer consumer)
{
    // Brings `col` up to date after `consumed` has been removed from the input:
    // a line feed inside `consumed` restarts the count, otherwise it is extended.
    static void advance(ref size_t col, string consumed)
    {
        immutable sinceLastLF = consumed.retro.until('\n').walkLength;
        if(consumed.canFind('\n')) col = sinceLastLF;
        else col += sinceLastLF;
    }

    hjson.skipWC(collumn);
    enforce(!hjson.empty, "Expected a value before EOF.");

    if(hjson.front == '{')
    {
        auto state = consumer.objectBegin();
        hjson.parseAggregate!('{', '}', parseObjectMember)(collumn, consumer);
        consumer.objectEnd(state);
    }
    else if(hjson.front == '[')
    {
        auto state = consumer.arrayBegin();
        hjson.parseAggregate!('[', ']', (ref hjs, ref col, ref ser){
            ser.elemBegin;
            hjs.parseValue(col,ser);
        })(collumn,consumer);
        consumer.arrayEnd(state);
    }
    // A builtin constant followed by certain characters is really a quoteless
    // string ("trueish", "null and void"); tryParseBuiltin detects that case and
    // leaves the input untouched.
    // Quoteless strings always stop right before a line feed or at EOF, so the
    // column does not need to be advanced past them.
    else if(hjson.save.startsWith("true"))
    {
        if(!hjson.tryParseBuiltin(collumn,true,"true",consumer))
            consumer.putValue(hjson.parseQuotelessString());
    }
    else if(hjson.save.startsWith("false"))
    {
        if(!hjson.tryParseBuiltin(collumn,false,"false",consumer))
            consumer.putValue(hjson.parseQuotelessString());
    }
    else if(hjson.save.startsWith("null"))
    {
        if(!hjson.tryParseBuiltin(collumn,null,"null",consumer))
            consumer.putValue(hjson.parseQuotelessString());
    }
    else if(hjson.front == '"')
        consumer.putValue(hjson.parseJSONString(collumn));
    else if(hjson.front == '\'')
    {
        if(hjson.save.startsWith("'''"))
        {
            // parseMultilineString receives the column by value, so the column
            // has to be advanced here from the text it consumed.
            string before = hjson;
            auto text = hjson.parseMultilineString(collumn);
            advance(collumn, before[0 .. $ - hjson.length]);
            consumer.putValue(text);
        }
        else
            consumer.putValue(hjson.parseJSONString(collumn));
    }
    else if(!hjson.front.isPunctuator)
    {
        // tryParseNumber has no column parameter; account for what it consumed.
        string before = hjson;
        if(hjson.tryParseNumber(consumer))
            advance(collumn, before[0 .. $ - hjson.length]);
        else
            consumer.putValue(hjson.parseQuotelessString());
    }
    else throw new HjsonException("Invalid Hjson.");
}
118 
/** Parses a single Hjson object or array.
    Params:
        start = Token which marks the beginning of parsed aggregate ('[' for array, '{' for object)
        end = Token which marks the end of parsed aggregate (']' for array, '}' for object)
        parseMember = Function used to parse a single aggregate member. It receives the
                      same runtime arguments as `parseAggregate`. Hjson passed to
                      `parseMember` contains no leading whitespace.
        hjson = string being parsed. Must start with `start` (no leading whitespace).
                Everything from the beginning of the string up to and including
                the closing `end` token is consumed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                  the last line feed. Updated by the function.
                  Needed to properly parse multiline strings.
        consumer = Object responsible for processing the parsed tokens.
    Throws:
        `HjsonException` if the aggregate is not terminated, if two members are
        not separated by a comma or a line feed, or if `parseMember` throws it.
        Invalid Hjson past the closing token is not detected.
*/
void parseAggregate
    (dchar start, dchar end, alias parseMember, Consumer)
    (ref string hjson, ref size_t collumn, ref Consumer consumer)
in(!hjson.empty)
in(hjson.front == start)
{
    // Get rid of the opening token and the whitespace following it
    hjson.popFront();
    ++collumn;
    hjson.skipWC(collumn);

    // Handle empty aggregate (whitespace/comments only)
    enforce(!hjson.empty, "Expected member or '%s' before EOF.".format(end));
    if(hjson.front == end)
    {
        hjson.popFront();
        ++collumn;
        return;
    }

    // Now we know we have at least one member
    parseMember(hjson, collumn, consumer);

    while(true)
    {
        // Members are separated by a line feed, a comma, or both
        bool gotMemberSeparator = hjson.skipWC(collumn);
        enforce(!hjson.empty, "Expected member or '%s' before EOF.".format(end));
        if(hjson.front == ',')
        {
            hjson.popFront();
            ++collumn;
            gotMemberSeparator = true;
            hjson.skipWC(collumn);
            enforce(!hjson.empty, "Expected member or '%s' before EOF.".format(end));
        }

        // A separator directly followed by the closing token is accepted
        if(hjson.front == end)
        {
            hjson.popFront();
            ++collumn;
            return;
        }

        enforce(gotMemberSeparator, "Expected ',' or line feed between members.");
        parseMember(hjson, collumn, consumer);
    }
}
188 
/** Decides whether the text following a JSON-looking number/bool/null turns
    the whole thing into a Hjson quoteless string.

    In JSON the first character of a value determines its type. In Hjson a valid
    number/bool/null followed by certain other characters on the same line is a
    quoteless string instead, so the characters after the value must be inspected.

    Params:
        sufix = Hjson following the previously parsed value.
    Returns: Whether `sufix` turns the preceding Hjson number/bool/null into a quoteless string.
*/
bool turnsIntoQuotelessString(string sufix)
{
    // Tokens that may directly follow a finished value
    static bool closesValue(dchar c)
    {
        return c == ',' || c == ']' || c == '}';
    }

    if(sufix.empty)
        return false;

    immutable first = sufix.front;
    if(closesValue(first) || first == '\n')
        return false;

    // A comment-starting token NOT SEPARATED BY WHITESPACE belongs to the string:
    // 1234#notcomment is a quoteless string
    // 1234 #comment is a number and a comment
    if(sufix.startsWith("//", "/*", "#"))
        return true;

    // Value directly followed by a non-whitespace, non-comma char -> quoteless string
    if(!first.isWhite)
        return true;

    // Whitespace follows the value: it is a quoteless string only if another
    // token shows up before the end of the line.
    size_t unusedCollumn;
    immutable reachedNextLine = skipWC(sufix, unusedCollumn);
    if(reachedNextLine || sufix.empty)
        return false;
    return !closesValue(sufix.front);
}
232 
/** Attempts to parse a builtin constant (`true`, `false` or `null`).
    Params:
        hjson = string being parsed. Must start with `repr` (no leading whitespace).
                `repr` is consumed from its front if and only if the constant
                was successfully parsed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                  the last line feed. Advanced past `repr` on success.
        value = Value of the constant, handed to `consumer` on success.
        repr = How the constant is represented in Hjson.
        consumer = Object responsible for processing the parsed tokens.
    Returns:
        `true` if parsing the constant succeeds, `false` if the value was actually a quoteless string.
*/
bool tryParseBuiltin(T,Consumer)(ref string hjson, ref size_t collumn, T value, string repr, ref Consumer consumer)
{
    auto rest = hjson[repr.length..$];

    // What follows decides whether this is the constant or just the beginning
    // of a quoteless string; in the latter case nothing is consumed.
    if(rest.turnsIntoQuotelessString)
        return false;

    consumer.putValue(value);
    hjson = rest;
    collumn += repr.walkLength;
    return true;
}
263 
/** Attempts to parse a Hjson number.

    The accepted grammar is the JSON one: optional `-`, an integer part (`0` or
    a run of digits), an optional fraction and an optional exponent. Numbers
    without fraction and exponent are reported as `long`; everything else, as
    well as integers that do not fit into a `long`, is reported as `double`.

    Params:
        hjson = string being parsed. Must be non-empty and must not contain
                leading whitespace. The number is consumed from its front if and
                only if it was successfully parsed. Only ASCII characters are
                ever consumed and no column counter is updated.
        consumer = Object responsible for processing the parsed tokens.
    Returns:
        `true` if parsing the number succeeds, `false` if the value was actually a quoteless string.
*/
bool tryParseNumber(Consumer)(ref string hjson, ref Consumer consumer)
{
    // Index just past the run of ASCII decimal digits starting at `from`
    static size_t skipDigits(string text, size_t from)
    {
        while(from < text.length && text[from].isDecimalDigit)
            ++from;
        return from;
    }

    size_t i = 0;
    bool parseAsDouble = false;

    // Optional preceding -
    if(hjson.front == '-') ++i;
    if(i >= hjson.length)
        return false;

    // Integer part: a lone 0 or a run of digits
    if(hjson[i] == '0') ++i;
    else if(hjson[i].isDecimalDigit) i = skipDigits(hjson, i);
    else return false;

    // Fractional part: '.' followed by at least one digit
    if(i < hjson.length && hjson[i] == '.')
    {
        immutable fractionStart = i + 1;
        i = skipDigits(hjson, fractionStart);
        if(i == fractionStart)
            return false;
        parseAsDouble = true;
    }

    // Exponent part: 'e' or 'E', optional sign, at least one digit
    if(i < hjson.length && (hjson[i] == 'e' || hjson[i] == 'E'))
    {
        ++i;
        if(i < hjson.length && (hjson[i] == '+' || hjson[i] == '-'))
            ++i;
        immutable exponentStart = i;
        i = skipDigits(hjson, exponentStart);
        if(i == exponentStart)
            return false;
        parseAsDouble = true;
    }

    if(turnsIntoQuotelessString(hjson[i..$]))
        return false;

    immutable literal = hjson[0..i];

    if(!parseAsDouble)
    {
        // Only the conversion is guarded: an exception thrown by the consumer
        // must not be mistaken for an integer that does not fit into a long.
        long integer;
        try
        {
            integer = literal.to!long;
        }
        catch(ConvException)
        {
            parseAsDouble = true;
        }

        if(!parseAsDouble)
            consumer.putValue(integer);
    }

    if(parseAsDouble)
        consumer.putValue(literal.to!double);

    // Everything consumed is ASCII, so slicing by `i` is exact
    hjson = hjson[i..$];
    return true;
}
339 
/** Parses a Hjson quoteless string.

    A quoteless string extends to the end of the current line; whitespace at
    its end is not part of the value.

    Params:
        hjson = Hjson being parsed. Must be non-empty and must not contain
                leading whitespace. Everything up to (but not including) the
                next line feed, or the whole input if there is none, is consumed.
    Returns: The parsed string.
*/
string parseQuotelessString(ref string hjson)
in(!hjson.empty)
{
    // Everything from the first line feed on stays in the input
    auto remainder = hjson.find('\n');
    auto line = hjson[0 .. $ - remainder.length];
    hjson = remainder;

    auto text = line.stripRight!isWhite;
    assert(!text.empty);
    return text;
}
359 
/** Parses a Hjson JSON-string (delimited by `"` or `'`).

    Supports the JSON escape sequences plus `\'`. A `\uXXXX` escape holding a
    UTF-16 high surrogate must be followed by a `\uXXXX` escape holding a low
    surrogate; the pair is decoded into a single code point.

    Params:
        hjson = Hjson being parsed. Must start with the opening quote (no leading
                whitespace). Everything from the beginning of the Hjson up to
                and including the closing quote is consumed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                the last line feed. Updated by the function.
                Needed to properly parse multiline strings.
    Throws:
        `HjsonException` if the string is not terminated or contains an invalid
        or incomplete escape sequence, including unpaired surrogates.
    Returns: The parsed string.
*/
string parseJSONString(ref string hjson, ref size_t collumn)
in(!hjson.empty)
in(hjson.front == '"' || hjson.front == '\'')
{
    // Consumes the four hex digits of a \uXXXX escape and returns their value.
    static uint takeHexQuad(ref string src, ref size_t col)
    {
        enforce(src.length >= 4, "Incomplete Unicode escape sequence.");
        auto digits = src[0..4];
        // Inspect code units: the slice may cut a multi-byte character, which
        // must be reported as an invalid escape rather than a decoding error.
        foreach(char digit; digits)
            enforce(digit.isHexDigit, "Invalid Unicode escape sequence.");
        src = src[4..$];
        col += 4;
        return digits.to!uint(16);
    }

    // The closing quote has to match the opening one
    immutable terminator = hjson.front;
    hjson.popFront();
    ++collumn;

    string result;

    while(!hjson.empty)
    {
        immutable c = hjson.front;
        hjson.popFront();
        ++collumn;

        if(c == '\n') collumn = 0;

        if(c == terminator)
            return result;

        if(c != '\\')
        {
            result ~= c;
            continue;
        }

        enforce(!hjson.empty, "Incomplete escape sequence.");
        immutable d = hjson.front;
        hjson.popFront();
        ++collumn;
        switch(d)
        {
            case '"', '\'', '\\', '/': result ~= d; break;

            case 'b': result ~= '\b'; break;
            case 'f': result ~= '\f'; break;
            case 'n': result ~= '\n'; break;
            case 'r': result ~= '\r'; break;
            case 't': result ~= '\t'; break;

            case 'u':
            {
                uint codePoint = takeHexQuad(hjson, collumn);
                if(codePoint >= 0xD800 && codePoint <= 0xDBFF)
                {
                    // High surrogate: only meaningful together with the low
                    // surrogate that has to follow as another \u escape.
                    enforce(
                        hjson.startsWith("\\u"),
                        "Expected low surrogate after high surrogate in Unicode escape sequence."
                    );
                    hjson = hjson[2..$];
                    collumn += 2;
                    immutable low = takeHexQuad(hjson, collumn);
                    enforce(
                        low >= 0xDC00 && low <= 0xDFFF,
                        "Invalid low surrogate in Unicode escape sequence."
                    );
                    codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
                }
                else
                    enforce(
                        codePoint < 0xDC00 || codePoint > 0xDFFF,
                        "Unpaired low surrogate in Unicode escape sequence."
                    );
                result ~= cast(dchar) codePoint;
            }
            break;

            default: throw new HjsonException("Invalid escape sequence: \\%s".format(d));
        }
    }
    throw new HjsonException("Unterminated string literal.");
}
427 
/** Parses a Hjson multiline string (delimited by `'''`).

    The string is post-processed in three steps: whitespace between the opening
    `'''` and the end of its line is dropped together with that line feed, up
    to `collumn` whitespace characters are removed from the beginning of every
    line, and if only whitespace separates the last line feed from the closing
    `'''`, that line feed and the whitespace after it are dropped.

    Params:
        hjson = Hjson being parsed. Must start with `'''` (no leading whitespace).
                Everything from the beginning of the Hjson up to and including
                the closing `'''` is consumed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                the last line feed, i.e. the column of the opening `'''`.
                Taken by value; it is not updated.
    Throws:
        `HjsonException` if the closing `'''` is missing.
    Returns: The parsed string.
*/
string parseMultilineString(ref string hjson, immutable size_t collumn)
in(!hjson.empty)
in(hjson.save.startsWith("'''"))
{
    hjson.popFrontN(3);
    auto s = hjson.findSplit("'''");
    enforce(s[1] == "'''", "Unterminated multiline string (missing ''').");
    hjson = s[2];
    auto str = s[0];

    // If line with opening ''' contains only whitespace, ignore that whitespace
    auto prefixWhitespace = str.save.until!(x => !x.isWhite);
    if(prefixWhitespace.canFind('\n'))
        str = str.find('\n')[1..$];

    // Unindent: drop at most `collumn` whitespace characters at the start of
    // each line. Iterates code units; the bytes of a multi-byte character are
    // never dropped because its lead byte ends the indentation.
    string result;
    size_t ignoreWhitespace = collumn;
    foreach(char x; str)
    {
        if(x == '\n')
        {
            ignoreWhitespace = collumn;
            result ~= x;
        }
        else if(x.isWhite && ignoreWhitespace > 0)
            --ignoreWhitespace;
        else
        {
            ignoreWhitespace = 0;
            result ~= x;
        }
    }

    // If suffix whitespace contains LF: remove it and all whitespace afterwards.
    // Works on slices so that multi-byte whitespace is removed byte-exactly.
    auto withoutTrailingBlanks = result.stripRight!(c => c.isWhite && c != '\n');
    if(withoutTrailingBlanks.endsWith('\n'))
        result = withoutTrailingBlanks[0..$-1];

    return result;
}
479 
/** Parses a single object member (`key: value`).

    The key is either a JSON-string or a quoteless key, which extends up to the
    first punctuator or whitespace character.

    Params:
        hjson = Hjson being parsed. Must not contain leading whitespace.
                Everything from the beginning of the Hjson up to the end of the
                parsed Hjson object member is consumed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                the last line feed. Updated by the function.
                Needed to properly parse multiline strings.
        consumer = Object responsible for processing the parsed tokens. Receives
                   the key through `putKey` before the value is parsed.
    Throws:
        `HjsonException` if `hjson` starts with an invalid Hjson object member.
        Invalid Hjson past the first valid object member is not detected.
*/
void parseObjectMember(Consumer)(ref string hjson, ref size_t collumn, ref Consumer consumer)
{
    // Parse the key
    string key;
    enforce(!hjson.empty, "Expected Hjson member before EOF.");
    enforce(!isPunctuator(hjson.front), "Expected Hjson member but got punctuator.");
    if(hjson.front == '"' || hjson.front == '\'')
        key = hjson.parseJSONString(collumn);
    else
    {
        // Scan decoded characters, not code units: the bytes of a multi-byte
        // character must not be classified on their own, and the byte length
        // (for slicing) differs from the character count (for the column).
        size_t keyBytes = hjson.length;
        size_t keyChars = 0;
        foreach(size_t offset, dchar c; hjson)
        {
            if(c.isPunctuator || c.isWhite)
            {
                keyBytes = offset;
                break;
            }
            ++keyChars;
        }
        key = hjson[0..keyBytes];
        hjson = hjson[keyBytes..$];
        collumn += keyChars;
    }

    // Get rid of ':'
    hjson.skipWC(collumn);
    enforce(!hjson.empty, "Expected ':' before EOF.");
    enforce(hjson.front == ':', "Expected ':'");
    hjson.popFront();
    ++collumn;

    // Parse the value
    hjson.skipWC(collumn);
    enforce(!hjson.empty, "Expected a value before EOF.");

    consumer.putKey(key);
    hjson.parseValue(collumn, consumer);
}
526 
/** Consumes all whitespace and comments from the front of the passed Hjson.

    Recognized comments are `# ...` and `// ...` (up to the end of the line) and
    `/* ... */` blocks.

    Params:
        hjson = Hjson from which whitespace and comments should be consumed.
        collumn = How many `dchar`s were popped from the front of `hjson` since
                the last line feed. Updated by the function.
                Needed to properly parse multiline strings.
    Throws:
        `HjsonException` if a block comment is not terminated before the end of the string.
    Returns:
        `true` if a line feed outside of a block comment was skipped, `false`
        otherwise. This is needed because line feeds can be used to separate
        aggregate members similar to commas.
*/
bool skipWC(ref string hjson, ref size_t collumn)
{
    bool skippedLF = false;

    // Whitespace and comments may alternate any number of times, so keep going
    // until a full pass consumes nothing.
    while(!hjson.empty)
    {
        bool consumedSomething = false;

        // Whitespace
        while(!hjson.empty && hjson.front.isWhite)
        {
            if(hjson.front == '\n')
            {
                skippedLF = true;
                collumn = 0;
            }
            else ++collumn;
            hjson.popFront();
            consumedSomething = true;
        }

        // Comments
        if(!hjson.empty)
        {
            if(hjson.front == '#' || hjson.save.startsWith("//"))
            {
                // Line comment: stop right before the line feed (or at EOF);
                // the line feed itself is handled by the whitespace loop.
                hjson = hjson.find('\n');
                collumn = 0;
                consumedSomething = true;
            }
            else if(hjson.save.startsWith("/*"))
            {
                // The opening token counts towards the column as well
                hjson.popFrontN(2);
                collumn += 2;
                while(!hjson.save.startsWith("*/"))
                {
                    enforce(!hjson.empty, "Unterminated block comment (missing */)");
                    if(hjson.front == '\n') collumn = 0;
                    else ++collumn;
                    hjson.popFront();
                }
                hjson.popFrontN(2);
                collumn += 2;
                consumedSomething = true;
            }
        }

        if(!consumedSomething) break;
    }
    return skippedLF;
}
587 
588 /*@("skipWC") unittest
589 {
590     string text = "  \t \r  ";
591     assert(!skipWC(text));
592     assert(text.empty);
593     
594     text = "    \t  hello";
595     assert(!skipWC(text));
596     assert(text == "hello");
597     
598     text = "  \n  ";
599     assert(skipWC(text));
600     assert(text.empty);
601 }*/
602 
/// Module-local `enforce`: `std.exception.enforce` instantiated to throw `HjsonException`.
alias enforce = _enforce!HjsonException;
604 
/** Tells whether `c` is one of the Hjson punctuators `{ } [ ] , :`.
    Hjson quoteless strings may not start with a punctuator,
    and quoteless object keys may not contain any punctuators.
*/
bool isPunctuator(dchar c)
{
    switch(c)
    {
        case '{', '}', '[', ']', ',', ':':
            return true;
        default:
            return false;
    }
}
613 
// Everything below is compiled only in unittest builds in which the
// `Have_unit_threaded` version identifier is set.
// NOTE(review): presumably set by dub when unit-threaded is a dependency - confirm.
version(unittest):
version(Have_unit_threaded):

import std.format : format;
import std.json : parseJSON;
import std.range : chain, only, iota;

import unit_threaded;

// Fixtures that come as three files: <name>_test.hjson, <name>_result.hjson and
// <name>_result.json. The files are embedded at compile time through string
// import expressions. One unittest named after the fixture is generated for each.
static foreach(testName; [
    "charset",
    "charset2",
    "comments",
    "empty",
    "kan",
    "keys",
    "oa",
    "passSingle",
    "stringify1",
    "strings",
    "strings2",
    "trail"
]) {
    @testName unittest 
    {
        immutable json = import(testName~"_result.json"),
            hjsonResult = import(testName~"_result.hjson"),
            hjsonTest = import(testName~"_test.hjson");

        // Both Hjson files, and the JSON file itself when fed to parseHjson,
        // must produce the same value std.json produces for the JSON file.
        hjsonTest.parseHjson.should == json.parseJSON;
        hjsonResult.parseHjson.should == json.parseJSON;
        json.parseHjson.should == json.parseJSON;
    }
}
// Fixtures for which only <name>_result.hjson and <name>_result.json are used.
static foreach(testName; [
    "mltabs",
    "pass1",
    "pass2",
    "pass3",
    "pass4"
]) {
    @testName unittest 
    {
        immutable hjson = import(testName~"_result.hjson"),
            json = import(testName~"_result.json");

        hjson.parseHjson.should == json.parseJSON;
        json.parseHjson.should == json.parseJSON;
    }
}

// failJSON<NN>_test.json fixtures that parseHjson is expected to reject with
// an HjsonException. The numbers covered are 2, 5-6, 11-17, 19-23, 26 and 28-33
// (the upper bound of `iota` is exclusive).
static foreach(failNr; chain(
    only(2),
    iota(5,7),
    iota(11,18),
    iota(19,24),
    only(26),
    iota(28,34)
)) {
    @Tags("invalid_input")
    @format("failJSON%d", failNr) unittest 
    {
        immutable json = import("failJSON%02d_test.json".format(failNr));
        json.parseHjson.shouldThrow!HjsonException;
    }
}

// failJSON fixtures marked as expected failures; the reason is given in the
// @ShouldFail attribute below.
static foreach(failNr; [7,8,10,34])
{
    @Tags("invalid_input")
    @ShouldFail("Hjson-d does not attempt to validate the rest of input after parsing a valid Hjson value.")
    @format("failJSON%d", failNr) unittest 
    {
        immutable json = import("failJSON%02d_test.json".format(failNr));
        json.parseHjson.shouldThrow!HjsonException;
    }
}