hunt.xml.DocumentParser source code

1 module hunt.xml.DocumentParser;
2 
3 import hunt.xml.Attribute;
4 import hunt.xml.Common;
5 import hunt.xml.Document;
6 import hunt.xml.Element;
7 import hunt.xml.Node;
8 import hunt.xml.Internal;
9 
10 import hunt.logging.ConsoleLogger;
11 
/**
 * Static helpers that parse XML text into a Document tree of Element nodes.
 */
15 class DocumentParser {
16 
17     /** 
18      * 
19      * Params:
20      *   stext = 
21      *   null = 
22      * Returns: 
23      */
24     static Document parse(ParsingFlags Flags = ParsingFlags.Default)(string stext , Document parent = null)
25     {
26         Document document = new Document();
27         // document.removeAllNodes();
28         // document.removeAllAttributes();
29         document.setParent(parent ? parent.firstNode() : null);
30         char[] text = cast(char[])stext.dup;
31 
32         parseBom(text);
33 
34         size_t index = 0;
35         size_t length = text.length;
36         while(1)
37         {
38             skip!(WhitespacePred)(text); 
39             if(text.length == 0)
40                 break;
41             if(text[index] =='<')
42             {
43                 ++index;
44                 text = text[index .. $];
45                 Element  node = parseNode!(Flags)(text);
46                 if(node !is null)
47                 {
48                     document.appendNode(node);
49                     if(Flags & (ParsingFlags.OpenOnly | ParsingFlags.ParseOne))
50                     {
51                         if(node.getType()  == NodeType.Comment)
52                             break;
53                     }
54                 }
55                 index=0;
56             }
57             else
58                 throw new XmlParsingException("expected <", text);
59         }
60 
61         if(!document.firstNode())
62             throw new XmlParsingException("no root element", text[index .. $ ]);
63 
64         return document;
65     }
66 
67     static private Element parseNode(int Flags)(ref char[] text)
68     {
69         switch(text[0])
70         {
71             // <...
72             default:
73                 return parseElement!Flags(text);
74 
75             // <?...
76             case '?':
77                 text = text[1 .. $ ];
78                 if(((text[0] == 'x' ) || (text[0] == 'X')) &&
79                 ((text[1] == 'm' ) || (text[1] == 'M')) &&
80                 ((text[2] == 'l' ) || (text[2] == 'L')) &&
81                 WhitespacePred.test(text[3]))
82                 {
83                     text = text[4 .. $];
84                     return parseXmlDeclaration!Flags(text);
85                 }
86                 else
87                 {
88                     return parsePI!Flags(text);
89                 }
90 
91             case '!':
92                 switch(text[1])
93                 {
94                 case '-':
95                     if(text[2] == '-')
96                     {
97                         text = text[3 .. $ ];
98                         return parseComment!Flags(text);
99                     } 
100                     break;
101                 case ('['):
102                     if (text[2] == ('C') && text[3] == ('D') && text[4] == ('A') &&
103                         text[5] == ('T') && text[6] == ('A') && text[7] == ('['))
104                     {
105                         // '<![CDATA[' - cdata
106                         text = text[8 .. $ ];     // Skip '![CDATA['
107                         return parseCdata!Flags(text);
108                     }
109                     break;
110 
111                 // <!D
112                 case ('D'):
113                     if (text[2] == ('O') && text[3] == ('C') && text[4] == ('T') &&
114                         text[5] == ('Y') && text[6] == ('P') && text[7] == ('E') &&
115                         WhitespacePred.test(text[8]))
116                     {
117                         // '<!DOCTYPE ' - doctype
118                         text = text[9 .. $ ];      // skip '!DOCTYPE '
119                         return parseDoctype!Flags(text);
120                     }
121                     break;
122                 default:
123                     break;
124 
125                 } 
126 
127                  text = text[1 .. $ ];     // Skip !
128                 while (text[0] != ('>'))
129                 {
130                     if (text == null)
131                         throw new XmlParsingException("unexpected end of data", text);
132                     text = text[1 .. $ ];
133                 }
134                 text = text[1 .. $ ];     // Skip '>'
135                 return null;   // No node recognized
136 
137         }
138     }
139 
140     static private Element parseCdata(int Flags)(ref char[] text)
141     {
142         // If CDATA is disabled
143         if (Flags & ParsingFlags.DataNodes)
144         {
145             // Skip until end of cdata
146             while (text[0] != ']' || text[1] != ']' || text[2] != '>')
147             {
148                 if (!text[0])
149                     throw new XmlParsingException("unexpected end of data", text);
150                 text = text[1 .. $];
151             }
152             text = text[3 .. $];      // Skip ]]>
153             return null;       // Do not produce CDATA node
154         }
155 
156         // Skip until end of cdata
157         char[] value = text;
158         while (text[0] != (']') || text[1] != (']') || text[2] != ('>'))
159         {
160             if (!text[0])
161                 throw new XmlParsingException("unexpected end of data", text);
162             text = text[1 .. $ ];
163         }
164 
165         // Create new cdata node
166         Element cdata = new Element(NodeType.CDATA);
167         cdata.setText = cast(string)value[ 0 .. value.length - text.length].dup;
168 
169         // Place zero terminator after value
170 
171         text = text[3 .. $ ];      // Skip ]]>
172         return cdata;
173     }
174 
175     static private char parseAndAppendData(int Flags)(Element node, ref char []text, char[] contents_start)
176     {
177         // Backup to contents start if whitespace trimming is disabled
178         if (!(Flags & ParsingFlags.TrimWhitespace))
179             text = contents_start;
180 
181         // Skip until end of data
182         char [] value = text;
183         char []end;
184         if (Flags & ParsingFlags.NormalizeWhitespace)
185             end = skipAndExpandCharacterRefs!(TextPred, TextPureWithWsPred, Flags)(text);
186         else
187             end = skipAndExpandCharacterRefs!(TextPred, TextPureNoWsPred, Flags)(text);
188 
189         // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after >
190         if (Flags & ParsingFlags.TrimWhitespace)
191         {
192             // FIXME: Needing refactor or cleanup -@zhangxueping at 2021-04-01T19:53:47+08:00
193             // 
194             if (Flags & ParsingFlags.NormalizeWhitespace)
195             {
196                 // Whitespace is already condensed to single space characters by skipping function, so just trim 1 char off the end
197                 if (end[-1] == ' ')
198                     end = end[-1 .. $];
199             }
200             else
201             {
202                 // Backup until non-whitespace character is found
203                 while (WhitespacePred.test(end[-1]))
204                     end = end[-1 .. $ - 1];
205             }
206         }
207 
208         // If characters are still left between end and value (this test is only necessary if normalization is enabled)
209         // Create new data node
210         if (!(Flags & ParsingFlags.DataNodes))
211         {
212             Element data = new Element(NodeType.Text);
213             data.setText = cast(string)value[0 .. value.length - end.length].dup;
214             node.appendNode(data);
215         }
216 
217         // Add data to parent node if no data exists yet
218         if (!(Flags & ParsingFlags.EelementValues))
219             if (node.getText.length == 0)
220                 node.setText = cast(string)value[0 ..value.length - end.length];
221 
222         // Place zero terminator after value
223         if (!(Flags & ParsingFlags.StringTerminators))
224         {
225             ubyte ch = text[0];
226             end[0] ='\0';
227             return ch;      // Return character that ends data; this is required because zero terminator overwritten it
228         }
229         else
230         // Return character that ends data
231         return text[0];
232     }
233 
234     static private Element parseElement(int Flags)(ref char[] text)
235     {
236         Element element = new Element();
237         char[] prefix = text;
238         //skip ElementNamePred
239         skip!(ElementNamePred)(text);
240         if(text == prefix)
241             throw new XmlParsingException("expected element name or prefix", text);
242         if(text.length >0 && text[0] == ':')
243         {
244             element.namespacePrefix = prefix[0 .. prefix.length - text.length].dup;
245             text = text[1 .. $ ];
246             char[] name = text;
247             //skip NodeNamePred
248             skip!(NodeNamePred)(text);
249             if(text == name)
250                 throw new XmlParsingException("expected element local name", text);
251             element.setName = name[0 .. name.length - text.length].dup;
252         }
253         else{
254             element.setName = prefix[ 0 .. prefix.length - text.length].dup;            
255         }
256 
257         //skip WhitespacePred
258         skip!(WhitespacePred)(text);
259         parseNodeAttributes!(Flags)(text , element);
260         if(text.length > 0 && text[0] == '>')
261         {
262             text = text[1 .. $];
263             char[] contents = text;
264             char[] contents_end = null;
265             if(!(Flags & ParsingFlags.OpenOnly))
266             {    
267                 contents_end = parseNodeContents!(Flags)(text , element);
268             }
269             if(contents_end.length != contents.length )
270             {
271                 element.contents = cast(string)contents[0 .. contents.length - contents_end.length].dup;
272             }
273         }
274         else if(text.length > 0 && text[0] == '/')
275         {
276             text = text[1 .. $ ];
277             if(text[0] != '>')
278                 throw new XmlParsingException("expected >", text);
279 
280             text = text[1 .. $ ];
281 
282             if(Flags & ParsingFlags.OpenOnly)
283                 throw new XmlParsingException("open_only, but closed", text);
284         }
285         else 
286             throw new XmlParsingException("expected >", text);
287         // Place zero terminator after name 
288         // no need.
289         return element;
290     }
291 
292     static private char[] parseNodeContents(int Flags)(ref char[] text , Element node)
293     {
294         char[] retval;
295         while(1)
296         {
297             char[] contents_start = text;
298             skip!(WhitespacePred)(text);
299             char next_char = text[0];
300 
301             after_data_node:
302 
303             switch(next_char)
304             {
305                 case '<':
306                 if(text[1] == '/')
307                 {
308                     retval = text;
309                     text = text[2 .. $ ];
310                     if(Flags & ParsingFlags.ValidateClosingTags)
311                     {
312                         string closing_name = cast(string)text.dup;
313                         skip!(NodeNamePred)(text);
314                         if(closing_name == node.getName)
315                             throw new XmlParsingException("invalid closing tag name", text);
316                     }
317                     else
318                     {
319                         skip!(NodeNamePred)(text);
320                     }
321 
322                     skip!(WhitespacePred)(text);
323                     if(text[0] != '>')
324                         throw new XmlParsingException("expected >", text);
325                     text = text[1 .. $];
326                     if(Flags & ParsingFlags.OpenOnly)
327                         throw new XmlParsingException("Unclosed element actually closed.", text);
328 
329                     return retval;
330                 }
331                 else
332                 {
333                     text = text[1 .. $ ];
334                     if(Element child = parseNode!(Flags & ~ParsingFlags.OpenOnly)(text))
335                         node.appendNode(child);
336                 }
337                 break;
338             default:
339                  next_char = parseAndAppendData!(Flags)(node, text, contents_start);
340                 goto after_data_node;   // Bypass regular processing after data nodes
341             }
342         }
343 
344         return null;
345     }
346 
347     static private void parseNodeAttributes(int Flags)(ref char[] text , Element node)
348     {
349         int index = 0;
350 
351         while(text.length > 0 && AttributeNamePred.test(text[0]))
352         {
353             char[] name = text;
354             text = text[1 .. $ ];
355             skip!(AttributeNamePred)(text);
356             if(text == name)
357                 throw new XmlParsingException("expected attribute name", name);
358 
359             Attribute attribute = new Attribute();
360             attribute.setName = cast(string)name[0 .. name.length - text.length].dup;
361 
362             node.appendAttribute(attribute);
363 
364             skip!(WhitespacePred)(text);
365 
366             if(text.length ==0 || text[0] != '=')
367                 throw new XmlParsingException("expected =", text);
368 
369             text = text[1 .. $ ];
370 
371             skip!(WhitespacePred)(text);
372 
373             char quote = text[0];
374             if(quote != '\'' && quote != '"')
375                 throw new XmlParsingException("expected ' or \"", text);
376 
377             text = text[1 .. $ ];
378             char[] value = text ;
379             char[] end;
380             const int AttFlags = Flags & ~ParsingFlags.NormalizeWhitespace;
381 
382             if(quote == '\'')
383                 end = skipAndExpandCharacterRefs!(AttributeValuePred!'\'' , AttributeValuePurePred!('\'') , AttFlags)(text);
384             else
385                 end = skipAndExpandCharacterRefs!(AttributeValuePred!('"') , AttributeValuePurePred!('"') , AttFlags)(text);
386 
387             attribute.setValue = cast(string)value[0 .. value.length - end.length].dup;
388 
389             if(text.length > 0 && text[0] != quote)
390                 throw new XmlParsingException("expected ' or \"", text);
391 
392             text = text[1 .. $ ];
393 
394             skip!(WhitespacePred)(text);
395         }
396     }    
397 
398 
399     static private void parseBom(ref char[] text)
400     {
401         if(text[0] == 0xEF 
402         && text[1] == 0xBB 
403         && text[2] == 0xBF)
404         {
405             text = text[3 .. $ ];
406         }
407     }
408 
409     static private Element parseXmlDeclaration(int Flags)(ref char[] text)
410     {
411         static if (Flags & ParsingFlags.DeclarationNode) {
412             // Create declaration
413             Element declaration = new Element(NodeType.Declaration);
414 
415             // Skip whitespace before attributes or ?>
416             skip!WhitespacePred(text);
417             // Parse declaration attributes
418             parseNodeAttributes!Flags(text, declaration);
419 
420             // Skip ?>
421             if (text[0] != '?' || text[1] != '>') 
422                 throw new XmlParsingException("expected ?>", text);
423             text = text[2 .. $ ];
424 
425             return declaration;
426         } else {
427             // If parsing of declaration is disabled
428             // Skip until end of declaration
429             while (text[0] != '?' || text[1] != '>')
430             {
431                 if (!text[0]) 
432                 throw new XmlParsingException("unexpected end of data", text);
433                 text = text[1 .. $ ];
434             }
435             text = text[2 .. $ ];    // Skip '?>'
436             return null;
437         }
438     }
439 
440     static private Element parsePI(int Flags)(ref char[] text)
441     {
442         // If creation of PI nodes is enabled
443         if (Flags & ParsingFlags.PiNodes)
444         {
445             // Create pi node
446             Element pi = new Element(NodeType.ProcessingInstruction);
447 
448             // Extract PI target name
449             char[] name = text;
450             skip!NodeNamePred(text);
451             if (text == name) 
452                 throw new XmlParsingException("expected PI target", text);
453             pi.setName = cast(string)name[0 .. name.length - text.length].dup;
454 
455             // Skip whitespace between pi target and pi
456             skip!WhitespacePred(text);
457 
458             // Remember start of pi
459             char[] value = text;
460 
461             // Skip to '?>'
462             while (text[0] != '?' || text[1] != '>')
463             {
464                 if (text == null)
465                     throw new XmlParsingException("unexpected end of data", text);
466                 text = text[1 .. $ ];
467             }
468 
469             // Set pi value (verbatim, no entity expansion or whitespace normalization)
470             pi.setText = cast(string)value[ 0 .. value.length - text.length ].dup;
471 
472             // Place zero terminator after name and value
473             // no need
474 
475             text = text[2 .. $ ];                          // Skip '?>'
476             return pi;
477         }
478         else
479         {
480             // Skip to '?>'
481             while (text[0] != '?' || text[1] != '>')
482             {
483                 if (text[0] == '\0')
484                     throw new XmlParsingException("unexpected end of data", text);
485                 text = text[1 .. $ ];
486             }
487             text = text[2 .. $ ];    // Skip '?>'
488             return null;
489         }
490     }
491 
492     static private Element parseComment(int Flags)(ref char[] text)
493     {
494         static if (Flags & ParsingFlags.CommentNodes) {
495             // Remember value start
496             auto value = text;
497 
498             // Skip until end of comment
499             while (text[0] != '-' || text[1] != '-' || text[2] != '>')
500             {
501                 if (!text[0]) throw new XmlParsingException("unexpected end of data", text);
502                 text= text[1 .. $];
503             }
504 
505             // Create comment node
506             Element comment = new Element(NodeType.Comment);
507             comment.setText = cast(string)value[0 .. value.length - text.length].dup;
508 
509             // Place zero terminator after comment value
510             // no need
511 
512             text = text[3 .. $ ];     // Skip '-->'
513             return comment;
514         } else { 
515             // If parsing of comments is disabled
516             // Skip until end of comment
517             while (text[0] != '-' || text[1] != '-' || text[2] != '>')
518             {
519                 if (!text[0]) throw new XmlParsingException("unexpected end of data", text);
520                 text = text[1 .. $];
521             }
522             text = text [3 .. $];     // Skip '-->'
523             return null;      // Do not produce comment node
524         }
525 
526 
527     }
528 
529     // Parse DOCTYPE
530 
531     static private Element parseDoctype(int Flags)(ref char[] text)
532     {
533         // Remember value start
534         char[] value = text;
535 
536         // Skip to >
537         while (text[0] != '>')
538         {
539             // Determine character type
540             switch (text[0])
541             {
542 
543             // If '[' encountered, scan for matching ending ']' using naive algorithm with depth
544             // This works for all W3C test files except for 2 most wicked
545             case ('['):
546             {
547                 text = text[1 .. $ ];     // Skip '['
548                 int depth = 1;
549                 while (depth > 0)
550                 {
551                     switch (text[0])
552                     {
553                         case '[': ++depth; break;
554                         case ']': --depth; break;
555                         default : throw new XmlParsingException("unexpected end of data", text);
556                     }
557                     text = text[1 .. $];
558                 }
559                 break;
560             }
561 
562             // Error on end of text
563             case '\0':
564                 throw new XmlParsingException("unexpected end of data", text);
565 
566             // Other character, skip it
567             default:
568                 text = text[1 .. $ ];
569 
570             }
571         }
572 
573         // If DOCTYPE nodes enabled
574         if (Flags & ParsingFlags.DoctypeNode)
575         {
576             // Create a new doctype node
577             Element doctype = new Element(NodeType.DocumentType);
578             doctype.setText = cast(string)value[ 0 .. value.length - text.length].dup;
579 
580             // Place zero terminator after value
581             // no need
582 
583             text = text[1 .. $ ];      // skip '>'
584             return doctype;
585         }
586         else
587         {
588             text = text[1 .. $ ];      // skip '>'
589             return null;
590         }
591     }
592 
593 }