1 /*
2  * This file is part of gir-to-d.
3  *
4  * gir-to-d is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation, either version 3
7  * of the License, or (at your option) any later version.
8  *
9  * gir-to-d is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with gir-to-d.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 module gtd.XMLReader;
19 
20 import std.algorithm;
21 import std.array;
22 import std.conv : to;
23 import std.exception;
24 import std.range;
25 import std.string;
26 import std.traits: isSomeChar;
27 import std.uni;
28 
29 import gtd.WrapException;
30 
/**
 * A single node produced by XMLReader.
 */
struct XMLNode
{
	/// Kind of node; a default-constructed node has type XMLNodeType.None.
	XMLNodeType type;

	/// Node payload: the tag name for StartTag, EmptyTag and EndTag nodes,
	/// the right-stripped text for Text nodes.
	string value;
	/// Attributes parsed from a tag, keyed by attribute name.
	string[string] attributes;
}
38 
/**
 * The kinds of node XMLReader can be positioned on.
 */
enum XMLNodeType
{
	/// Default value of a freshly constructed XMLNode.
	None,
	/// Processing instruction, introduced by "<?".
	PI,
	/// Opening tag.
	StartTag,
	/// Character data between tags.
	Text,
	/// Section introduced by "<![CDATA[".
	CData,
	/// Declaration introduced by "<!DOCTYPE".
	DocType,
	/// Comment introduced by "<!--".
	Comment,
	/// Tag closed in place with "/>".
	EmptyTag,
	/// Closing tag, introduced by "</".
	EndTag,
	/// Set by XMLReader.popFront once the input is exhausted.
	DocumentEnd
}
52 
/**
 * Pull parser that turns an XML document into a sequence of XMLNodes.
 *
 * The node at the current position is available as `front`; `popFront`
 * advances to the next node and `empty` reports whether the end of the
 * document has been reached.
 *
 * Inputs that end in the middle of a construct are either accepted as far as
 * they go (text, comments, CDATA, PI, attribute values) or rejected with an
 * XMLException; the parser never reads past the end of the input itself.
 */
class XMLReader(T)
	if (isInputRange!T &&  isSomeChar!(ElementType!T) )
{
	/// The node the reader is currently positioned on.
	XMLNode front;
	/// File name reported in diagnostic messages.
	string fileName;

	// Strings are read code unit by code unit through ByChar.
	static if ( is( T == string ) )
		private CountLines!ByChar document;
	else
		private CountLines!T document;

	/**
	 * Params:
	 *     document = The XML document to parse.
	 *     fileName = File name to print in diagnostic messages.
	 */
	this(T document, string fileName = null)
	{
		static if ( is( T == string ) )
			this.document = CountLines!ByChar(ByChar(document));
		else
			this.document = CountLines!T(document);

		this.fileName = fileName;

		// Position the reader on the first node.
		popFront();
	}

	/// The line of the input the reader is currently at.
	@property size_t line()
	{
		return document.line;
	}
	alias lineNumber = line;

	/**
	 * Advance to the next node. When the input is exhausted front becomes
	 * a node of type XMLNodeType.DocumentEnd.
	 *
	 * Throws: XMLException when the input is not well formed.
	 */
	void popFront()
	{
		front = XMLNode();

		if ( document.empty )
		{
			front.type = XMLNodeType.DocumentEnd;
			return;
		}

		if ( document.front == '<' )
			parseTag();
		else
			parseText();
	}

	/// True once the input is consumed and front is the DocumentEnd node.
	@property bool empty()
	{
		return document.empty && front.type == XMLNodeType.DocumentEnd;
	}

	/// Throws an XMLException when there is no input left to read.
	private void requireInput()
	{
		if ( document.empty )
			throw new XMLException(this, "Unexpected end of document");
	}

	/**
	 * Consume the literal token from the input.
	 *
	 * Works on plain input ranges and keeps the line count intact, which
	 * is why it is used instead of std.algorithm.skipOver.
	 */
	private void expect(string token)
	{
		foreach ( char c; token )
		{
			if ( document.empty || document.front != c )
				throw new XMLException(this, "Invalid XML tag");

			document.popFront();
		}
	}

	/// Dispatch on the characters following '<' and skip trailing whitespace.
	private void parseTag()
	{
		document.popFront();
		requireInput();

		switch ( document.front )
		{
			case '!':
				document.popFront();
				requireInput();

				switch ( document.front )
				{
					case '[':
						expect("[CDATA[");
						parseCDATA();
						break;
					case 'D':
						// The '!' is already consumed at this point.
						expect("DOCTYPE");
						parseDocType();
						break;
					case '-':
						expect("--");
						parseComment();
						break;
					default:
						throw new XMLException(this, "Invalid XML tag");
				}
				break;
			case '?':
				document.popFront();
				parsePI();
				break;
			case '/':
				document.popFront();
				// An end tag has the same shape as a start tag.
				parseStartTag();
				front.type = XMLNodeType.EndTag;
				break;
			default:
				parseStartTag();
				break;
		}

		skipWhitespace();
	}

	/// Read the content of a CDATA section up to and including "]]>".
	private void parseCDATA()
	{
		front.type = XMLNodeType.CData;
		auto buff = appender!string();

		// Number of consecutive ']' read that are not yet part of the value.
		size_t pending;
		bool closed;

		while ( !closed && !document.empty )
		{
			immutable c = document.front;
			document.popFront();

			if ( c == ']' )
			{
				pending++;
			}
			else if ( c == '>' && pending >= 2 )
			{
				// The last two brackets belong to the terminator,
				// any brackets before them are content.
				pending -= 2;
				closed = true;
			}
			else
			{
				foreach ( i; 0 .. pending )
					buff.put(']');

				pending = 0;
				buff.put(c);
			}
		}

		foreach ( i; 0 .. pending )
			buff.put(']');

		front.value = buff.data;
	}

	/// Read a DOCTYPE declaration, including an internal subset in brackets.
	private void parseDocType()
	{
		front.type = XMLNodeType.DocType;
		auto buff = appender!string();
		int bracketCount;

		skipWhitespace();

		while ( !document.empty )
		{
			immutable c = document.front;

			if ( c == '[' )
				bracketCount++;
			else if ( c == ']' )
				bracketCount--;
			else if ( c == '>' && bracketCount <= 0 )
			{
				// A '>' outside of the internal subset ends the declaration.
				document.popFront();
				break;
			}

			buff.put(c);
			document.popFront();
		}

		front.value = buff.data.stripRight();
	}

	/// Read a comment up to and including "-->".
	private void parseComment()
	{
		front.type = XMLNodeType.Comment;
		auto buff = appender!string();

		while ( !document.empty )
		{
			immutable c = document.front;
			document.popFront();

			if ( c != '-' )
			{
				buff.put(c);
				continue;
			}

			// A single '-' is ordinary content.
			if ( document.empty || document.front != '-' )
			{
				buff.put('-');
				continue;
			}

			document.popFront();

			if ( document.empty || document.front != '>' )
				throw new XMLException(this, "-- not allowed in comments.");

			document.popFront();
			break;
		}

		front.value = buff.data.strip();
	}

	/// Read a processing instruction up to and including "?>".
	private void parsePI()
	{
		front.type = XMLNodeType.PI;
		auto buff = appender!string();

		while ( !document.empty )
		{
			immutable c = document.front;
			document.popFront();

			if ( c == '?' && !document.empty && document.front == '>' )
			{
				document.popFront();
				break;
			}

			buff.put(c);
		}

		front.value = buff.data.stripRight();
	}

	/**
	 * Read the tag name and the attributes of a tag, up to and including
	 * the closing '>'. Sets the node type to EmptyTag when the tag is
	 * closed with "/>".
	 */
	private void parseStartTag()
	{
		front.type = XMLNodeType.StartTag;
		auto buff = appender!string();

		while ( !document.empty && !(document.front.isWhite() || document.front == '/' || document.front == '>') )
		{
			buff.put(document.front);
			document.popFront();
		}

		front.value = buff.data;

		while ( !document.empty )
		{
			skipWhitespace();
			requireInput();

			if ( document.front == '/' )
			{
				front.type = XMLNodeType.EmptyTag;
				document.popFront();
				requireInput();
			}

			if ( document.front == '>' )
			{
				document.popFront();
				return;
			}

			// Attribute name.
			buff = appender!string();

			while ( !document.empty && !(document.front.isWhite() || document.front == '=') )
			{
				buff.put(document.front);
				document.popFront();
			}

			string attName = buff.data;

			// Whitespace is allowed on both sides of the '='.
			skipWhitespace();

			if ( !document.empty && document.front == '=' )
				document.popFront();

			skipWhitespace();
			requireInput();

			immutable quote = document.front;

			if ( quote != '"' && quote != '\'' )
				throw new XMLException(this, "Attribute value must be quoted");

			document.popFront();

			// Attribute value, up to the matching quote.
			buff = appender!string();

			while ( !document.empty && document.front != quote )
			{
				if ( document.front == '&' )
					parseAmpersand(buff);
				else
					buff.put(document.front);

				document.popFront();
			}

			if ( !document.empty )
				document.popFront();

			front.attributes[attName] = buff.data;
		}
	}

	/// Read character data up to the next '<', resolving references.
	private void parseText()
	{
		front.type = XMLNodeType.Text;
		auto buff = appender!string();

		while ( !document.empty && document.front != '<' )
		{
			if ( document.front == '&' )
				parseAmpersand(buff);
			else
				buff.put(document.front);

			document.popFront();
		}

		front.value = buff.data.stripRight();
	}

	private void skipWhitespace()
	{
		while ( !document.empty && isWhite(document.front) )
			document.popFront();
	}

	/**
	 * Resolve the entity or character reference starting at the current
	 * '&' and append the resulting character to buff.
	 *
	 * On return the document is positioned on the terminating ';',
	 * the caller is expected to consume it.
	 */
	private void parseAmpersand(ref Appender!(string) buff)
	{
		// The longest supported reference is "#x10FFFF": eight characters.
		ElementType!(typeof(document))[8] sequence;
		size_t index;

		document.popFront();

		while ( !document.empty && document.front != ';' )
		{
			if ( index == sequence.length )
				throw new XMLException(this, "Unregonized escape secuence");

			sequence[index++] = document.front;
			document.popFront();
		}

		requireInput();

		switch ( sequence[0 .. index] )
		{
			case "quot":
				buff.put('"');
				break;
			case "amp":
				buff.put('&');
				break;
			case "apos":
				buff.put('\'');
				break;
			case "lt":
				buff.put('<');
				break;
			case "gt":
				buff.put('>');
				break;
			default:
				buff.put(decodeCharacterReference(sequence[0 .. index]));
				break;
		}
	}

	/**
	 * Decode a numeric character reference without the surrounding '&'
	 * and ';', either decimal ("#60") or hexadecimal ("#x3C").
	 *
	 * Throws: XMLException when reference is not a numeric reference or
	 *     does not denote a valid Unicode code point.
	 */
	private dchar decodeCharacterReference(C)(const(C)[] reference)
	{
		immutable bool hex = reference.length > 1 && reference[1] == 'x';
		immutable size_t start = hex ? 2 : 1;
		immutable uint base = hex ? 16 : 10;

		bool valid = reference.length > start && reference[0] == '#';
		// At most seven digits are read, so code can not overflow.
		uint code;

		if ( valid )
		{
			foreach ( c; reference[start .. $] )
			{
				uint digit;

				if ( c >= '0' && c <= '9' )
					digit = cast(uint)(c - '0');
				else if ( hex && c >= 'a' && c <= 'f' )
					digit = cast(uint)(c - 'a') + 10;
				else if ( hex && c >= 'A' && c <= 'F' )
					digit = cast(uint)(c - 'A') + 10;
				else
				{
					valid = false;
					break;
				}

				code = code * base + digit;
			}
		}

		// Surrogates and values above U+10FFFF are not characters.
		if ( !valid || code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF) )
			throw new XMLException(this, "Unregonized escape secuence");

		return cast(dchar) code;
	}

	// The tests construct the reader from a string literal, so they are
	// only compiled for the string instantiation.
	static if ( is( T == string ) )
	{
		unittest
		{
			auto reader = new XMLReader("&lt;test&gt;");
			assert(reader.front.value == "<test>");

			reader = new XMLReader("<a b=\"&#65;&#x42;\"><![CDATA[x]]y]]></a>");
			assert(reader.front.type == XMLNodeType.StartTag);
			assert(reader.front.attributes["b"] == "AB");

			reader.popFront();
			assert(reader.front.type == XMLNodeType.CData);
			assert(reader.front.value == "x]]y");

			reader.popFront();
			assert(reader.front.type == XMLNodeType.EndTag);
			assert(reader.front.value == "a");
		}
	}
}
442 
/**
 * Skip the current tag together with everything nested inside it.
 *
 * An empty tag is left untouched, any other node that is not a start tag
 * is simply stepped over. For a start tag the reader is advanced until it
 * points to the end tag with the same name at the same depth.
 */
void skipTag(T)(XMLReader!T reader)
{
	immutable startType = reader.front.type;

	if ( startType == XMLNodeType.EmptyTag )
		return;

	if ( startType != XMLNodeType.StartTag )
	{
		reader.popFront();
		return;
	}

	immutable string name = reader.front.value;

	// nesting counts the tags that are open at the current position.
	for ( size_t nesting = 0; !reader.empty; reader.popFront() )
	{
		switch ( reader.front.type )
		{
			case XMLNodeType.StartTag:
				nesting++;
				break;
			case XMLNodeType.EndTag:
				nesting--;
				break;
			default:
				break;
		}

		if ( nesting == 0 && reader.front.value == name )
			return;
	}
}
474 
/**
 * Whether the reader is positioned on an end tag named tagName.
 */
bool endTag(T)(XMLReader!T reader, string tagName)
{
	if ( reader.front.type != XMLNodeType.EndTag )
		return false;

	return reader.front.value == tagName;
}
482 
/// Whether the reader is positioned on an end tag named like one of tagNames.
bool endTag(T)(XMLReader!T reader, string[] tagNames ...)
{
	if ( reader.front.type != XMLNodeType.EndTag )
		return false;

	foreach ( candidate; tagNames )
	{
		if ( candidate == reader.front.value )
			return true;
	}

	return false;
}
488 
/**
 * Exception that records the file name and the line of the XMLReader
 * it was created for.
 */
class XMLException : WrapException
{
	/**
	 * Params:
	 *     reader = The reader whose fileName and line are reported.
	 *     msg    = Description of the problem.
	 */
	this (T)(XMLReader!T reader, string msg)
	{
		super(msg, reader.fileName, reader.line, null);
	}

	/// Returns the text produced by the sink based overload as one string.
	override string toString()
	{
		string text;
		toString((chunk) { text ~= chunk; });
		return text;
	}

	/// Writes "file(line)" to sink, followed by ": msg" when there is a message.
	override void toString(scope void delegate(in char[]) sink) const
	{
		sink(file);
		sink("(");
		sink(line.to!string);
		sink(")");

		if ( msg.length == 0 )
			return;

		sink(": ");
		sink(msg);
	}

}
515 
/**
 * Forward range over the code units of a string, without decoding them
 * to code points.
 */
struct ByChar
{
	/// The part of the string that is not consumed yet.
	string data;

	/// The first remaining code unit.
	@property char front()
	{
		return data[0];
	}

	/// True when all code units are consumed.
	@property bool empty()
	{
		return data.length == 0;
	}

	/// Drop the first remaining code unit.
	void popFront()
	{
		assert(data.length, "Attempting to popFront() past the end of an array");
		data = data[1 .. data.length];
	}

	/// A copy of the range at its current position.
	@property ByChar save()
	{
		return this;
	}

	alias data this;
}
543 
/**
 * Range adapter that forwards the elements of Source while keeping track
 * of the current line number.
 *
 * line starts at 1 and is incremented every time popFront consumes a '\n',
 * so it names the line the current front element is on.
 */
struct CountLines(Source) if (isSomeChar!(ElementType!Source))
{
	/// The wrapped range.
	Source src;
	/// 1-based line number of the current position.
	size_t line = 1;

	this(Source src)
	{
		this.src = src;
	}

	@property ElementType!Source front()
	{
		return src.front;
	}

	@property bool empty()
	{
		return src.empty;
	}

	void popFront()
	{
		// Inspect the element before dropping it: the source may be
		// empty afterwards, in which case front must not be read.
		if ( src.front == '\n' )
			line++;

		src.popFront();
	}

	// Only forward ranges can be saved; leaving save out otherwise keeps
	// the adapter usable with plain input ranges.
	static if ( isForwardRange!Source )
	{
		/// A copy of the range at its current position and line.
		@property typeof(this) save()
		{
			auto copy = typeof(this)(src.save);
			copy.line = line;
			return copy;
		}
	}
}