hunt.xml.DocumentParser source code

1 module hunt.xml.DocumentParser;
2 
3 import hunt.xml.Attribute;
4 import hunt.xml.Common;
5 import hunt.xml.Document;
6 import hunt.xml.Element;
7 import hunt.xml.Node;
8 import hunt.xml.Internal;
9 
10 /** 
11  * 
12  */
13 class DocumentParser {
14 
15     /** 
16      * 
17      * Params:
18      *   stext = 
19      *   null = 
20      * Returns: 
21      */
22     static Document parse(ParsingFlags Flags = ParsingFlags.Default)(string stext , Document parent = null)
23     {
24         Document document = new Document();
25         // document.removeAllNodes();
26         // document.removeAllAttributes();
27         document.setParent(parent ? parent.firstNode() : null);
28         char[] text = cast(char[])stext.dup;
29 
30         parseBom(text);
31 
32         size_t index = 0;
33         size_t length = text.length;
34         while(1)
35         {
36             skip!(WhitespacePred)(text); 
37             if(text.length == 0)
38                 break;
39             if(text[index] =='<')
40             {
41                 ++index;
42                 text = text[index .. $];
43                 Element  node = parseNode!(Flags)(text);
44                 if(node !is null)
45                 {
46                     document.appendNode(node);
47                     if(Flags & (ParsingFlags.OpenOnly | ParsingFlags.ParseOne))
48                     {
49                         if(node.getType()  == NodeType.Comment)
50                             break;
51                     }
52                 }
53                 index=0;
54             }
55             else
56                 throw new XmlParsingException("expected <", text);
57         }
58 
59         if(!document.firstNode())
60             throw new XmlParsingException("no root element", text[index .. $ ]);
61 
62         return document;
63     }
64 
65     static private Element parseNode(int Flags)(ref char[] text)
66     {
67         switch(text[0])
68         {
69             // <...
70             default:
71                 return parseElement!Flags(text);
72 
73             // <?...
74             case '?':
75                 text = text[1 .. $ ];
76                 if(((text[0] == 'x' ) || (text[0] == 'X')) &&
77                 ((text[1] == 'm' ) || (text[1] == 'M')) &&
78                 ((text[2] == 'l' ) || (text[2] == 'L')) &&
79                 WhitespacePred.test(text[3]))
80                 {
81                     text = text[4 .. $];
82                     return parseXmlDeclaration!Flags(text);
83                 }
84                 else
85                 {
86                     return parsePI!Flags(text);
87                 }
88 
89             case '!':
90                 switch(text[1])
91                 {
92                 case '-':
93                     if(text[2] == '-')
94                     {
95                         text = text[3 .. $ ];
96                         return parseComment!Flags(text);
97                     } 
98                     break;
99                 case ('['):
100                     if (text[2] == ('C') && text[3] == ('D') && text[4] == ('A') &&
101                         text[5] == ('T') && text[6] == ('A') && text[7] == ('['))
102                     {
103                         // '<![CDATA[' - cdata
104                         text = text[8 .. $ ];     // Skip '![CDATA['
105                         return parseCdata!Flags(text);
106                     }
107                     break;
108 
109                 // <!D
110                 case ('D'):
111                     if (text[2] == ('O') && text[3] == ('C') && text[4] == ('T') &&
112                         text[5] == ('Y') && text[6] == ('P') && text[7] == ('E') &&
113                         WhitespacePred.test(text[8]))
114                     {
115                         // '<!DOCTYPE ' - doctype
116                         text = text[9 .. $ ];      // skip '!DOCTYPE '
117                         return parseDoctype!Flags(text);
118                     }
119                     break;
120                 default:
121                     break;
122 
123                 } 
124 
125                  text = text[1 .. $ ];     // Skip !
126                 while (text[0] != ('>'))
127                 {
128                     if (text == null)
129                         throw new XmlParsingException("unexpected end of data", text);
130                     text = text[1 .. $ ];
131                 }
132                 text = text[1 .. $ ];     // Skip '>'
133                 return null;   // No node recognized
134 
135         }
136     }
137 
138     static private Element parseCdata(int Flags)(ref char[] text)
139     {
140         // If CDATA is disabled
141         if (Flags & ParsingFlags.DataNodes)
142         {
143             // Skip until end of cdata
144             while (text[0] != ']' || text[1] != ']' || text[2] != '>')
145             {
146                 if (!text[0])
147                     throw new XmlParsingException("unexpected end of data", text);
148                 text = text[1 .. $];
149             }
150             text = text[3 .. $];      // Skip ]]>
151             return null;       // Do not produce CDATA node
152         }
153 
154         // Skip until end of cdata
155         char[] value = text;
156         while (text[0] != (']') || text[1] != (']') || text[2] != ('>'))
157         {
158             if (!text[0])
159                 throw new XmlParsingException("unexpected end of data", text);
160             text = text[1 .. $ ];
161         }
162 
163         // Create new cdata node
164         Element cdata = new Element(NodeType.CDATA);
165         cdata.setText = cast(string)value[ 0 .. value.length - text.length].dup;
166 
167         // Place zero terminator after value
168 
169         text = text[3 .. $ ];      // Skip ]]>
170         return cdata;
171     }
172 
173     static private char parseAndAppendData(int Flags)(Element node, ref char []text, char[] contents_start)
174     {
175         // Backup to contents start if whitespace trimming is disabled
176         if (!(Flags & ParsingFlags.TrimWhitespace))
177             text = contents_start;
178 
179         // Skip until end of data
180         char [] value = text;
181         char []end;
182         if (Flags & ParsingFlags.NormalizeWhitespace)
183             end = skipAndExpandCharacterRefs!(TextPred, TextPureWithWsPred, Flags)(text);
184         else
185             end = skipAndExpandCharacterRefs!(TextPred, TextPureNoWsPred, Flags)(text);
186 
187         // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after >
188         if (Flags & ParsingFlags.TrimWhitespace)
189         {
190             if (Flags & ParsingFlags.NormalizeWhitespace)
191             {
192                 // Whitespace is already condensed to single space characters by skipping function, so just trim 1 char off the end
193                 if (end[-1] == ' ')
194                     end = end[-1 .. $];
195             }
196             else
197             {
198                 // Backup until non-whitespace character is found
199                 while (WhitespacePred.test(end[-1]))
200                     end = end[-1 .. $ - 1];
201             }
202         }
203 
204         // If characters are still left between end and value (this test is only necessary if normalization is enabled)
205         // Create new data node
206         if (!(Flags & ParsingFlags.DataNodes))
207         {
208             Element data = new Element(NodeType.Text);
209             data.setText = cast(string)value[0 .. value.length - end.length].dup;
210             node.appendNode(data);
211         }
212 
213         // Add data to parent node if no data exists yet
214         if (!(Flags & ParsingFlags.EelementValues))
215             if (node.getText.length == 0)
216                 node.setText = cast(string)value[0 ..value.length - end.length];
217 
218         // Place zero terminator after value
219         if (!(Flags & ParsingFlags.StringTerminators))
220         {
221             ubyte ch = text[0];
222             end[0] ='\0';
223             return ch;      // Return character that ends data; this is required because zero terminator overwritten it
224         }
225         else
226         // Return character that ends data
227         return text[0];
228     }
229 
230     static private Element parseElement(int Flags)(ref char[] text)
231     {
232         Element element = new Element();
233         char[] prefix = text;
234         //skip ElementNamePred
235         skip!(ElementNamePred)(text);
236         if(text == prefix)
237             throw new XmlParsingException("expected element name or prefix", text);
238         if(text.length >0 && text[0] == ':')
239         {
240             element.namespacePrefix = prefix[0 .. prefix.length - text.length].dup;
241             text = text[1 .. $ ];
242             char[] name = text;
243             //skip NodeNamePred
244             skip!(NodeNamePred)(text);
245             if(text == name)
246                 throw new XmlParsingException("expected element local name", text);
247             element.setName = name[0 .. name.length - text.length].dup;
248         }
249         else{
250             element.setName = prefix[ 0 .. prefix.length - text.length].dup;            
251         }
252 
253         //skip WhitespacePred
254         skip!(WhitespacePred)(text);
255         parseNodeAttributes!(Flags)(text , element);
256         if(text.length > 0 && text[0] == '>')
257         {
258             text = text[1 .. $];
259             char[] contents = text;
260             char[] contents_end = null;
261             if(!(Flags & ParsingFlags.OpenOnly))
262             {    
263                 contents_end = parseNodeContents!(Flags)(text , element);
264             }
265             if(contents_end.length != contents.length )
266             {
267                 element.contents = cast(string)contents[0 .. contents.length - contents_end.length].dup;
268             }
269         }
270         else if(text.length > 0 && text[0] == '/')
271         {
272             text = text[1 .. $ ];
273             if(text[0] != '>')
274                 throw new XmlParsingException("expected >", text);
275 
276             text = text[1 .. $ ];
277 
278             if(Flags & ParsingFlags.OpenOnly)
279                 throw new XmlParsingException("open_only, but closed", text);
280         }
281         else 
282             throw new XmlParsingException("expected >", text);
283         // Place zero terminator after name 
284         // no need.
285         return element;
286     }
287 
288     static private char[] parseNodeContents(int Flags)(ref char[] text , Element node)
289     {
290         char[] retval;
291 
292         while(1)
293         {
294             char[] contents_start = text;
295             skip!(WhitespacePred)(text);
296             char next_char = text[0];
297 
298             after_data_node:
299 
300             switch(next_char)
301             {
302                 case '<':
303                 if(text[1] == '/')
304                 {
305                     retval = text;
306                     text = text[2 .. $ ];
307                     if(Flags & ParsingFlags.ValidateClosingTags)
308                     {
309                         string closing_name = cast(string)text.dup;
310                         skip!(NodeNamePred)(text);
311                         if(closing_name == node.getName)
312                             throw new XmlParsingException("invalid closing tag name", text);
313                     }
314                     else
315                     {
316                         skip!(NodeNamePred)(text);
317                     }
318 
319                     skip!(WhitespacePred)(text);
320                     if(text[0] != '>')
321                         throw new XmlParsingException("expected >", text);
322                     text = text[1 .. $];
323                     if(Flags & ParsingFlags.OpenOnly)
324                         throw new XmlParsingException("Unclosed element actually closed.", text);
325 
326                     return retval;
327                 }
328                 else
329                 {
330                     text = text[1 .. $ ];
331                     if(Element child = parseNode!(Flags & ~ParsingFlags.OpenOnly)(text))
332                         node.appendNode(child);
333                 }
334                 break;
335             default:
336                  next_char = parseAndAppendData!(Flags)(node, text, contents_start);
337                 goto after_data_node;   // Bypass regular processing after data nodes
338             }
339         }
340 
341         return null;
342     }
343 
344     static private void parseNodeAttributes(int Flags)(ref char[] text , Element node)
345     {
346         int index = 0;
347 
348         while(text.length > 0 && AttributeNamePred.test(text[0]))
349         {
350             char[] name = text;
351             text = text[1 .. $ ];
352             skip!(AttributeNamePred)(text);
353             if(text == name)
354                 throw new XmlParsingException("expected attribute name", name);
355 
356             Attribute attribute = new Attribute();
357             attribute.setName = cast(string)name[0 .. name.length - text.length].dup;
358 
359             node.appendAttribute(attribute);
360 
361             skip!(WhitespacePred)(text);
362 
363             if(text.length ==0 || text[0] != '=')
364                 throw new XmlParsingException("expected =", text);
365 
366             text = text[1 .. $ ];
367 
368             skip!(WhitespacePred)(text);
369 
370             char quote = text[0];
371             if(quote != '\'' && quote != '"')
372                 throw new XmlParsingException("expected ' or \"", text);
373 
374             text = text[1 .. $ ];
375             char[] value = text ;
376             char[] end;
377             const int AttFlags = Flags & ~ParsingFlags.NormalizeWhitespace;
378 
379             if(quote == '\'')
380                 end = skipAndExpandCharacterRefs!(AttributeValuePred!'\'' , AttributeValuePurePred!('\'') , AttFlags)(text);
381             else
382                 end = skipAndExpandCharacterRefs!(AttributeValuePred!('"') , AttributeValuePurePred!('"') , AttFlags)(text);
383 
384             attribute.setValue = cast(string)value[0 .. value.length - end.length].dup;
385 
386             if(text.length > 0 && text[0] != quote)
387                 throw new XmlParsingException("expected ' or \"", text);
388 
389             text = text[1 .. $ ];
390 
391             skip!(WhitespacePred)(text);
392         }
393     }    
394 
395 
396     static private void parseBom(ref char[] text)
397     {
398         if(text[0] == 0xEF 
399         && text[1] == 0xBB 
400         && text[2] == 0xBF)
401         {
402             text = text[3 .. $ ];
403         }
404     }
405 
406     static private Element parseXmlDeclaration(int Flags)(ref char[] text)
407     {
408         static if (Flags & ParsingFlags.DeclarationNode) {
409             // Create declaration
410             Element declaration = new Element(NodeType.Declaration);
411 
412             // Skip whitespace before attributes or ?>
413             skip!WhitespacePred(text);
414             // Parse declaration attributes
415             parseNodeAttributes!Flags(text, declaration);
416 
417             // Skip ?>
418             if (text[0] != '?' || text[1] != '>') 
419                 throw new XmlParsingException("expected ?>", text);
420             text = text[2 .. $ ];
421 
422             return declaration;
423         } else {
424             // If parsing of declaration is disabled
425             // Skip until end of declaration
426             while (text[0] != '?' || text[1] != '>')
427             {
428                 if (!text[0]) 
429                 throw new XmlParsingException("unexpected end of data", text);
430                 text = text[1 .. $ ];
431             }
432             text = text[2 .. $ ];    // Skip '?>'
433             return null;
434         }
435     }
436 
437     static private Element parsePI(int Flags)(ref char[] text)
438     {
439         // If creation of PI nodes is enabled
440         if (Flags & ParsingFlags.PiNodes)
441         {
442             // Create pi node
443             Element pi = new Element(NodeType.ProcessingInstruction);
444 
445             // Extract PI target name
446             char[] name = text;
447             skip!NodeNamePred(text);
448             if (text == name) 
449                 throw new XmlParsingException("expected PI target", text);
450             pi.setName = cast(string)name[0 .. name.length - text.length].dup;
451 
452             // Skip whitespace between pi target and pi
453             skip!WhitespacePred(text);
454 
455             // Remember start of pi
456             char[] value = text;
457 
458             // Skip to '?>'
459             while (text[0] != '?' || text[1] != '>')
460             {
461                 if (text == null)
462                     throw new XmlParsingException("unexpected end of data", text);
463                 text = text[1 .. $ ];
464             }
465 
466             // Set pi value (verbatim, no entity expansion or whitespace normalization)
467             pi.setText = cast(string)value[ 0 .. value.length - text.length ].dup;
468 
469             // Place zero terminator after name and value
470             // no need
471 
472             text = text[2 .. $ ];                          // Skip '?>'
473             return pi;
474         }
475         else
476         {
477             // Skip to '?>'
478             while (text[0] != '?' || text[1] != '>')
479             {
480                 if (text[0] == '\0')
481                     throw new XmlParsingException("unexpected end of data", text);
482                 text = text[1 .. $ ];
483             }
484             text = text[2 .. $ ];    // Skip '?>'
485             return null;
486         }
487     }
488 
489     static private Element parseComment(int Flags)(ref char[] text)
490     {
491         static if (Flags & ParsingFlags.CommentNodes) {
492             // Remember value start
493             auto value = text;
494 
495             // Skip until end of comment
496             while (text[0] != '-' || text[1] != '-' || text[2] != '>')
497             {
498                 if (!text[0]) throw new XmlParsingException("unexpected end of data", text);
499                 text= text[1 .. $];
500             }
501 
502             // Create comment node
503             Element comment = new Element(NodeType.Comment);
504             comment.setText = cast(string)value[0 .. value.length - text.length].dup;
505 
506             // Place zero terminator after comment value
507             // no need
508 
509             text = text[3 .. $ ];     // Skip '-->'
510             return comment;
511         } else { 
512             // If parsing of comments is disabled
513             // Skip until end of comment
514             while (text[0] != '-' || text[1] != '-' || text[2] != '>')
515             {
516                 if (!text[0]) throw new XmlParsingException("unexpected end of data", text);
517                 text = text[1 .. $];
518             }
519             text = text [3 .. $];     // Skip '-->'
520             return null;      // Do not produce comment node
521         }
522 
523 
524     }
525 
526     // Parse DOCTYPE
527 
528     static private Element parseDoctype(int Flags)(ref char[] text)
529     {
530         // Remember value start
531         char[] value = text;
532 
533         // Skip to >
534         while (text[0] != '>')
535         {
536             // Determine character type
537             switch (text[0])
538             {
539 
540             // If '[' encountered, scan for matching ending ']' using naive algorithm with depth
541             // This works for all W3C test files except for 2 most wicked
542             case ('['):
543             {
544                 text = text[1 .. $ ];     // Skip '['
545                 int depth = 1;
546                 while (depth > 0)
547                 {
548                     switch (text[0])
549                     {
550                         case '[': ++depth; break;
551                         case ']': --depth; break;
552                         default : throw new XmlParsingException("unexpected end of data", text);
553                     }
554                     text = text[1 .. $];
555                 }
556                 break;
557             }
558 
559             // Error on end of text
560             case '\0':
561                 throw new XmlParsingException("unexpected end of data", text);
562 
563             // Other character, skip it
564             default:
565                 text = text[1 .. $ ];
566 
567             }
568         }
569 
570         // If DOCTYPE nodes enabled
571         if (Flags & ParsingFlags.DoctypeNode)
572         {
573             // Create a new doctype node
574             Element doctype = new Element(NodeType.DocumentType);
575             doctype.setText = cast(string)value[ 0 .. value.length - text.length].dup;
576 
577             // Place zero terminator after value
578             // no need
579 
580             text = text[1 .. $ ];      // skip '>'
581             return doctype;
582         }
583         else
584         {
585             text = text[1 .. $ ];      // skip '>'
586             return null;
587         }
588     }
589 
590 }