module hunt.xml.DocumentParser;

import hunt.xml.Attribute;
import hunt.xml.Common;
import hunt.xml.Document;
import hunt.xml.Element;
import hunt.xml.Node;
import hunt.xml.Internal;

/**
 * Recursive-descent XML parser that turns a string into a Document tree.
 *
 * All parsing routines operate on a mutable `char[]` slice passed by
 * reference: consuming input means re-slicing `text` so that it always
 * starts at the next unread character. Running out of input is reported
 * as an XmlParsingException rather than as an array bounds violation.
 */
class DocumentParser {

    /**
     * Parses `stext` into a new Document.
     *
     * Params:
     *  Flags  = compile-time parsing options.
     *  stext  = the XML source text; it is copied, never modified.
     *  parent = optional document whose first node becomes the parent of
     *           the new document.
     * Returns: the populated document.
     * Throws: XmlParsingException if the text is malformed or yields no node.
     */
    static Document parse(ParsingFlags Flags = ParsingFlags.Default)(string stext , Document parent = null)
    {
        Document document = new Document();
        document.setParent(parent ? parent.firstNode() : null);

        // Work on a private mutable copy: the parser re-slices this buffer
        // and may write terminator characters into it.
        char[] text = stext.dup;

        parseBom(text);

        while (true)
        {
            skip!(WhitespacePred)(text);
            if (text.length == 0)
                break;
            if (text[0] != '<')
                throw new XmlParsingException("expected <", text);

            text = text[1 .. $];    // Skip '<'
            Element node = parseNode!(Flags)(text);
            if (node is null)
                continue;           // Construct was skipped, no node produced

            document.appendNode(node);
            if (Flags & (ParsingFlags.OpenOnly | ParsingFlags.ParseOne))
            {
                // In these modes parsing stops after the first element;
                // comments, declarations etc. preceding it do not end the parse.
                if (node.getType() == NodeType.Element)
                    break;
            }
        }

        if (!document.firstNode())
            throw new XmlParsingException("no root element", text);

        return document;
    }

    /**
     * Tells whether `text` starts with `prefix`, without ever indexing
     * past the end of `text`.
     */
    static private bool beginsWith(const(char)[] text, string prefix)
    {
        return text.length >= prefix.length && text[0 .. prefix.length] == prefix;
    }

    /**
     * Advances `text` past the next occurrence of `terminator`.
     *
     * A NUL character is treated as end of data, like an exhausted buffer.
     *
     * Returns: the slice that preceded the terminator.
     * Throws: XmlParsingException("unexpected end of data") if the
     *         terminator is not found.
     */
    static private char[] takeUntil(ref char[] text, string terminator)
    {
        char[] start = text;
        while (!beginsWith(text, terminator))
        {
            if (text.length == 0 || text[0] == '\0')
                throw new XmlParsingException("unexpected end of data", text);
            text = text[1 .. $];
        }
        char[] consumed = start[0 .. start.length - text.length];
        text = text[terminator.length .. $];
        return consumed;
    }

    /**
     * Compares a closing tag name with the name stored on the element.
     *
     * parseElement stores a namespace prefix separately from the name, so a
     * closing name of the form `prefix:name` is also accepted when the part
     * after the first ':' equals the element name.
     * NOTE(review): the prefix itself is not validated here - confirm whether
     * it should be checked against the element's namespace prefix.
     */
    static private bool closingNameMatches(const(char)[] closingName, const(char)[] elementName)
    {
        if (closingName == elementName)
            return true;
        foreach (i, c; closingName)
        {
            if (c == ':')
                return closingName[i + 1 .. $] == elementName;
        }
        return false;
    }

    /**
     * Parses the construct that follows a '<' which has already been consumed,
     * dispatching on its first characters.
     *
     * Returns: the created node, or null when the construct was skipped.
     * Throws: XmlParsingException on malformed or truncated input.
     */
    static private Element parseNode(int Flags)(ref char[] text)
    {
        if (text.length == 0)
            throw new XmlParsingException("unexpected end of data", text);

        // <?...
        if (text[0] == '?')
        {
            text = text[1 .. $];
            if (text.length >= 4 &&
                ((text[0] == 'x' ) || (text[0] == 'X')) &&
                ((text[1] == 'm' ) || (text[1] == 'M')) &&
                ((text[2] == 'l' ) || (text[2] == 'L')) &&
                WhitespacePred.test(text[3]))
            {
                // '<?xml ' - xml declaration
                text = text[4 .. $];
                return parseXmlDeclaration!Flags(text);
            }
            return parsePI!Flags(text);
        }

        // <!...
        if (text[0] == '!')
        {
            if (beginsWith(text, "!--"))
            {
                text = text[3 .. $ ];       // Skip '!--'
                return parseComment!Flags(text);
            }
            if (beginsWith(text, "![CDATA["))
            {
                text = text[8 .. $ ];       // Skip '![CDATA['
                return parseCdata!Flags(text);
            }
            if (text.length > 8 && beginsWith(text, "!DOCTYPE") &&
                WhitespacePred.test(text[8]))
            {
                text = text[9 .. $ ];       // Skip '!DOCTYPE '
                return parseDoctype!Flags(text);
            }

            // Unrecognized '<!...' construct: skip it up to and including '>'
            text = text[1 .. $ ];           // Skip '!'
            takeUntil(text, ">");
            return null;                    // No node recognized
        }

        // <...
        return parseElement!Flags(text);
    }

    /**
     * Parses a CDATA section; `text` starts right after '<![CDATA['.
     *
     * Returns: a CDATA node holding the section text, or null when the
     *          DataNodes flag is set (the section is then only skipped).
     * Throws: XmlParsingException if ']]>' is missing.
     */
    static private Element parseCdata(int Flags)(ref char[] text)
    {
        // Consume everything up to and including ']]>'
        char[] value = takeUntil(text, "]]>");

        if (Flags & ParsingFlags.DataNodes)
            return null;      // Do not produce CDATA node

        Element cdata = new Element(NodeType.CDATA);
        cdata.setText = cast(string)value.dup;
        return cdata;
    }

    /**
     * Parses character data inside an element and attaches it to `node`.
     *
     * Params:
     *  node           = element that receives the data.
     *  text           = input positioned after leading whitespace; on return
     *                   it is positioned at the character that ended the data.
     *  contents_start = input position before the leading whitespace was
     *                   skipped; used when whitespace trimming is off.
     * Returns: the character that ended the data (read before any terminator
     *          is written over it).
     * Throws: XmlParsingException if the input ends inside the data.
     */
    static private char parseAndAppendData(int Flags)(Element node, ref char []text, char[] contents_start)
    {
        // Backup to contents start if whitespace trimming is disabled
        if (!(Flags & ParsingFlags.TrimWhitespace))
            text = contents_start;

        // Skip until end of data
        char[] value = text;
        char[] end;
        if (Flags & ParsingFlags.NormalizeWhitespace)
            end = skipAndExpandCharacterRefs!(TextPred, TextPureWithWsPred, Flags)(text);
        else
            end = skipAndExpandCharacterRefs!(TextPred, TextPureNoWsPred, Flags)(text);

        // Data is only parsed inside an element, so the input must continue
        if (text.length == 0)
            throw new XmlParsingException("unexpected end of data", text);

        // `end` is treated as the tail of `value` that follows the data, so
        // the data itself is value[0 .. dataLength].
        size_t dataLength = value.length - end.length;

        // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after >
        if (Flags & ParsingFlags.TrimWhitespace)
        {
            if (Flags & ParsingFlags.NormalizeWhitespace)
            {
                // Whitespace is already condensed to single spaces, so at most one char is trimmed
                if (dataLength > 0 && value[dataLength - 1] == ' ')
                    --dataLength;
            }
            else
            {
                // Backup until non-whitespace character is found
                while (dataLength > 0 && WhitespacePred.test(value[dataLength - 1]))
                    --dataLength;
            }
            end = value[dataLength .. $];
        }

        // Create new data node
        if (!(Flags & ParsingFlags.DataNodes))
        {
            Element data = new Element(NodeType.Text);
            data.setText = cast(string)value[0 .. dataLength].dup;
            node.appendNode(data);
        }

        // Add data to parent node if no data exists yet
        if (!(Flags & ParsingFlags.EelementValues))
            if (node.getText.length == 0)
                node.setText = cast(string)value[0 .. dataLength];

        // Read the ending character first: the terminator below may overwrite it
        char ch = text[0];
        if (!(Flags & ParsingFlags.StringTerminators))
            end[0] = '\0';
        return ch;
    }

    /**
     * Parses an element; `text` starts at the element name (after '<').
     *
     * Reads the optional namespace prefix, the name and the attributes, then
     * either the element contents up to the closing tag or the '/>' of an
     * empty element.
     *
     * Returns: the parsed element.
     * Throws: XmlParsingException on malformed or truncated input.
     */
    static private Element parseElement(int Flags)(ref char[] text)
    {
        Element element = new Element();
        char[] prefix = text;
        skip!(ElementNamePred)(text);
        if (text.length == prefix.length)
            throw new XmlParsingException("expected element name or prefix", text);

        if (text.length > 0 && text[0] == ':')
        {
            // What was read so far is the namespace prefix
            element.namespacePrefix = prefix[0 .. prefix.length - text.length].dup;
            text = text[1 .. $ ];
            char[] name = text;
            skip!(NodeNamePred)(text);
            if (text.length == name.length)
                throw new XmlParsingException("expected element local name", text);
            element.setName = name[0 .. name.length - text.length].dup;
        }
        else
        {
            element.setName = prefix[ 0 .. prefix.length - text.length].dup;
        }

        skip!(WhitespacePred)(text);
        parseNodeAttributes!(Flags)(text , element);

        if (text.length > 0 && text[0] == '>')
        {
            text = text[1 .. $];
            char[] contents = text;
            char[] contents_end = null;
            if (!(Flags & ParsingFlags.OpenOnly))
            {
                contents_end = parseNodeContents!(Flags)(text , element);
            }
            if (contents_end.length != contents.length )
            {
                // Raw text between the opening tag and the closing tag
                element.contents = cast(string)contents[0 .. contents.length - contents_end.length].dup;
            }
        }
        else if (text.length > 0 && text[0] == '/')
        {
            text = text[1 .. $ ];
            if (text.length == 0 || text[0] != '>')
                throw new XmlParsingException("expected >", text);

            text = text[1 .. $ ];

            if (Flags & ParsingFlags.OpenOnly)
                throw new XmlParsingException("open_only, but closed", text);
        }
        else
            throw new XmlParsingException("expected >", text);

        return element;
    }

    /**
     * Parses the children of `node` up to and including its closing tag.
     *
     * Returns: the input slice starting at the '</' of the closing tag.
     * Throws: XmlParsingException on truncated input, a malformed closing tag
     *         or, with ValidateClosingTags, a closing name that does not
     *         match the element.
     */
    static private char[] parseNodeContents(int Flags)(ref char[] text , Element node)
    {
        while (true)
        {
            char[] contents_start = text;
            skip!(WhitespacePred)(text);
            if (text.length == 0)
                throw new XmlParsingException("unexpected end of data", text);

            if (text[0] != '<')
            {
                // Character data. It can only be followed by markup; any other
                // stop character would be handed back to the data parser
                // without making progress.
                char stop = parseAndAppendData!(Flags)(node, text, contents_start);
                if (stop != '<')
                    throw new XmlParsingException("unexpected end of data", text);
            }

            // text[0] is '<' here (or was, before a terminator overwrote it)
            if (text.length > 1 && text[1] == '/')
            {
                // Closing tag of this node
                char[] contents_end = text;
                text = text[2 .. $ ];      // Skip '</'

                if (Flags & ParsingFlags.ValidateClosingTags)
                {
                    char[] closing_start = text;
                    skip!(NodeNamePred)(text);
                    string closing_name = cast(string)closing_start[0 .. closing_start.length - text.length];
                    if (!closingNameMatches(closing_name, node.getName))
                        throw new XmlParsingException("invalid closing tag name", text);
                }
                else
                {
                    skip!(NodeNamePred)(text);
                }

                skip!(WhitespacePred)(text);
                if (text.length == 0 || text[0] != '>')
                    throw new XmlParsingException("expected >", text);
                text = text[1 .. $];
                if (Flags & ParsingFlags.OpenOnly)
                    throw new XmlParsingException("Unclosed element actually closed.", text);

                return contents_end;
            }

            // Child node
            text = text[1 .. $ ];          // Skip '<'
            if (Element child = parseNode!(Flags & ~ParsingFlags.OpenOnly)(text))
                node.appendNode(child);
        }
    }

    /**
     * Parses zero or more `name = "value"` attributes and appends them to
     * `node`. Parsing stops at the first character that cannot start an
     * attribute name.
     *
     * Throws: XmlParsingException if '=' or a matching quote is missing.
     */
    static private void parseNodeAttributes(int Flags)(ref char[] text , Element node)
    {
        // Attribute values are never whitespace-normalized
        enum int AttFlags = Flags & ~ParsingFlags.NormalizeWhitespace;

        while (text.length > 0 && AttributeNamePred.test(text[0]))
        {
            // Extract attribute name; its first character was validated above
            char[] name = text;
            text = text[1 .. $ ];
            skip!(AttributeNamePred)(text);

            Attribute attribute = new Attribute();
            attribute.setName = cast(string)name[0 .. name.length - text.length].dup;

            node.appendAttribute(attribute);

            skip!(WhitespacePred)(text);

            if (text.length == 0 || text[0] != '=')
                throw new XmlParsingException("expected =", text);

            text = text[1 .. $ ];          // Skip '='

            skip!(WhitespacePred)(text);

            if (text.length == 0 || (text[0] != '\'' && text[0] != '"'))
                throw new XmlParsingException("expected ' or \"", text);
            char quote = text[0];

            text = text[1 .. $ ];          // Skip opening quote
            char[] value = text;
            char[] end;

            if (quote == '\'')
                end = skipAndExpandCharacterRefs!(AttributeValuePred!'\'' , AttributeValuePurePred!('\'') , AttFlags)(text);
            else
                end = skipAndExpandCharacterRefs!(AttributeValuePred!('"') , AttributeValuePurePred!('"') , AttFlags)(text);

            attribute.setValue = cast(string)value[0 .. value.length - end.length].dup;

            // The value must be closed by the quote that opened it
            if (text.length == 0 || text[0] != quote)
                throw new XmlParsingException("expected ' or \"", text);

            text = text[1 .. $ ];          // Skip closing quote

            skip!(WhitespacePred)(text);
        }
    }

    /**
     * Skips a leading UTF-8 byte order mark (EF BB BF) if one is present.
     * Text shorter than three characters is left untouched.
     */
    static private void parseBom(ref char[] text)
    {
        if (text.length >= 3
            && text[0] == 0xEF
            && text[1] == 0xBB
            && text[2] == 0xBF)
        {
            text = text[3 .. $ ];
        }
    }

    /**
     * Parses an XML declaration; `text` starts right after '<?xml' and the
     * whitespace character that follows it.
     *
     * Returns: a Declaration node carrying the declaration attributes when
     *          the DeclarationNode flag is set, otherwise null (the
     *          declaration is skipped).
     * Throws: XmlParsingException if '?>' is missing.
     */
    static private Element parseXmlDeclaration(int Flags)(ref char[] text)
    {
        static if (Flags & ParsingFlags.DeclarationNode) {
            Element declaration = new Element(NodeType.Declaration);

            // Skip whitespace before attributes or ?>
            skip!WhitespacePred(text);
            parseNodeAttributes!Flags(text, declaration);

            if (!beginsWith(text, "?>"))
                throw new XmlParsingException("expected ?>", text);
            text = text[2 .. $ ];          // Skip '?>'

            return declaration;
        } else {
            // Declaration nodes are disabled: skip to the end of the declaration
            takeUntil(text, "?>");
            return null;
        }
    }

    /**
     * Parses a processing instruction; `text` starts right after '<?'.
     *
     * Returns: a ProcessingInstruction node (name = target, text = verbatim
     *          instruction) when the PiNodes flag is set, otherwise null.
     * Throws: XmlParsingException if the target or '?>' is missing.
     */
    static private Element parsePI(int Flags)(ref char[] text)
    {
        if (Flags & ParsingFlags.PiNodes)
        {
            Element pi = new Element(NodeType.ProcessingInstruction);

            // Extract PI target name
            char[] name = text;
            skip!NodeNamePred(text);
            if (text.length == name.length)
                throw new XmlParsingException("expected PI target", text);
            pi.setName = cast(string)name[0 .. name.length - text.length].dup;

            // Skip whitespace between pi target and pi
            skip!WhitespacePred(text);

            // Value is taken verbatim, no entity expansion or whitespace normalization
            char[] value = takeUntil(text, "?>");
            pi.setText = cast(string)value.dup;
            return pi;
        }
        else
        {
            takeUntil(text, "?>");
            return null;
        }
    }

    /**
     * Parses a comment; `text` starts right after '<!--'.
     *
     * Returns: a Comment node holding the comment text when the CommentNodes
     *          flag is set, otherwise null (the comment is skipped).
     * Throws: XmlParsingException if '-->' is missing.
     */
    static private Element parseComment(int Flags)(ref char[] text)
    {
        static if (Flags & ParsingFlags.CommentNodes) {
            char[] value = takeUntil(text, "-->");

            Element comment = new Element(NodeType.Comment);
            comment.setText = cast(string)value.dup;
            return comment;
        } else {
            // Comment nodes are disabled: skip to the end of the comment
            takeUntil(text, "-->");
            return null;      // Do not produce comment node
        }
    }

    /**
     * Parses a DOCTYPE declaration; `text` starts right after '<!DOCTYPE'
     * and the whitespace character that follows it.
     *
     * A bracketed internal subset is skipped by counting '[' and ']' nesting
     * depth, which is a deliberately naive scan.
     *
     * Returns: a DocumentType node holding the raw declaration text when the
     *          DoctypeNode flag is set, otherwise null.
     * Throws: XmlParsingException if the input ends before the closing '>'.
     */
    static private Element parseDoctype(int Flags)(ref char[] text)
    {
        // Remember value start
        char[] value = text;

        // Skip to >
        while (true)
        {
            if (text.length == 0 || text[0] == '\0')
                throw new XmlParsingException("unexpected end of data", text);
            if (text[0] == '>')
                break;

            if (text[0] == '[')
            {
                // Scan for the matching ']' while tracking nesting depth;
                // every other character inside the brackets is skipped.
                text = text[1 .. $ ];      // Skip '['
                int depth = 1;
                while (depth > 0)
                {
                    if (text.length == 0 || text[0] == '\0')
                        throw new XmlParsingException("unexpected end of data", text);
                    if (text[0] == '[')
                        ++depth;
                    else if (text[0] == ']')
                        --depth;
                    text = text[1 .. $];
                }
            }
            else
            {
                // Other character, skip it
                text = text[1 .. $ ];
            }
        }

        char[] doctypeText = value[ 0 .. value.length - text.length];
        text = text[1 .. $ ];              // Skip '>'

        if (Flags & ParsingFlags.DoctypeNode)
        {
            Element doctype = new Element(NodeType.DocumentType);
            doctype.setText = cast(string)doctypeText.dup;
            return doctype;
        }
        return null;
    }

}