url source code

1 /**
2 	* A URL handling library.
3 	*
	* URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
5 	* elements like port, path, username, and password.
6 	*
7 	* This module aims to make it simple to muck about with them.
8 	*
9 	* Example usage:
10 	* ---
11 	* auto url = "ssh://me:password@192.168.0.8/".parseURL;
12 	* auto files = system("ssh", url.toString, "ls").splitLines;
13 	* foreach (file; files) {
14 	*		system("scp", url ~ file, ".");
15 	* }
16 	* ---
17 	*
18 	* License: The MIT license.
19 	*/
20 module url;
21 
22 import std.algorithm;
23 import std.array;
24 import std.conv;
25 import std.encoding;
26 import std.string;
27 import std.utf;
28 
/**
	* An exception thrown when something bad happens with URLs.
	*
	* Params:
	*   msg  = Human-readable description of the problem.
	*   file = Source file of the throw site; defaults to the caller's file.
	*   line = Source line of the throw site; defaults to the caller's line.
	*   next = Optional chained exception.
	*/
class URLException : Exception {
	this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null) {
		// Forward the throw site so stack traces point at the caller rather than at this class.
		super(msg, file, line, next);
	}
}
33 
/**
	* A mapping from schemes to their default ports.
	*
	* This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
	* use even if they use ports. Entries here should be treated as best guesses.
	*
	* Keys are scheme names exactly as they appear in a URL (lowercase, without "://"); values are
	* port numbers. The table is filled in by the module constructor below.
	*/
ushort[string] schemeToDefaultPort;

// Module constructor: populates schemeToDefaultPort with the known scheme/port pairs.
static this() {
	schemeToDefaultPort = [
		"aaa": 3868,
		"aaas": 5658,
		"acap": 674,
		"cap": 1026,
		"coap": 5683,
		"coaps": 5684,
		"dav": 443,
		"dict": 2628,
		"ftp": 21,
		"git": 9418,
		"go": 1096,
		"gopher": 70,
		"http": 80,
		"https": 443,
		"iac": 4569,
		"icap": 1344,
		"imap": 143,
		"ipp": 631,
		"ipps": 631,  // yes, they're both mapped to port 631
		"irc": 6667,  // De facto default port, not the IANA reserved port.
		"ircs": 6697,
		"iris": 702,  // defaults to iris.beep
		"iris.beep": 702,
		"iris.lwz": 715,
		"iris.xpc": 713,
		"iris.xpcs": 714,
		"jabber": 5222,  // client-to-server
		"ldap": 389,
		"ldaps": 636,
		"msrp": 2855,
		"msrps": 2855,
		"mtqp": 1038,
		"mupdate": 3905,
		"news": 119,
		"nfs": 2049,
		"pop": 110,
		"redis": 6379,
		"reload": 6084,
		"rsync": 873,
		"rtmfp": 1935,
		"rtsp": 554,
		"shttp": 80,
		"sieve": 4190,
		"sip": 5060,
		"sips": 5061,
		"smb": 445,
		"smtp": 25,
		"snews": 563,
		"snmp": 161,
		"soap.beep": 605,
		"ssh": 22,
		"stun": 3478,
		"stuns": 5349,
		"svn": 3690,
		"teamspeak": 9987,
		"telnet": 23,
		"tftp": 69,
		"tip": 3372,
	];
}
104 
/**
	* A Uniform Resource Locator.
	*
	* URLs can be parsed (see parseURL) and implicitly convert to strings.
	*/
struct URL {
	/// The URL scheme. For instance, ssh, ftp, or https.
	string scheme;

	/// The username in this URL. Usually absent. If present, there will also be a password.
	string user;

	/// The password in this URL. Usually absent.
	string pass;

	/// The hostname.
	string host;

	/**
		* The port.
		*
		* This is inferred from the scheme if it isn't present in the URL itself.
		* If the scheme is not known and the port is not present, the port will be given as 0.
		* For some schemes, port will not be sensible -- for instance, file or chrome-extension.
		*
		* If you explicitly need to detect whether the user provided a port, check the providedPort
		* field.
		*/
	@property ushort port() {
		if (providedPort != 0) {
			return providedPort;
		}
		if (auto p = scheme in schemeToDefaultPort) {
			return *p;
		}
		return 0;
	}

	/**
		* Set the port.
		*
		* This sets the providedPort field and is provided for convenience.
		*
		* Returns: the value that was stored.
		*/
	@property ushort port(ushort value) {
		return providedPort = value;
	}

	/// The port that was explicitly provided in the URL. Zero means "not provided".
	ushort providedPort;

	/**
		* The path.
		*
		* For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
		* "/news/story/17774".
		*/
	string path;

	/**
		* The query string elements.
		*
		* For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
		* elements will be ["visited": "false"].
		*
		* Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
		* ["item": ""].
		*
		* This field is mutable, so be cautious.
		*/
	string[string] query;

	/**
		* The fragment. In web documents, this typically refers to an anchor element.
		* For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
		*/
	string fragment;

	/**
		* Convert this URL to a string.
		* The string is properly formatted and usable for, eg, a web request.
		*/
	string toString() {
		return toString(false);
	}

	/**
		* Convert this URL to a string.
		* The string is intended to be human-readable rather than machine-readable.
		*/
	string toHumanReadableString() {
		return toString(true);
	}

	/**
		* Shared implementation of toString and toHumanReadableString.
		*
		* When humanReadable is false, user, password, path segments, query and fragment are
		* percent-encoded and the host is Punycode-encoded. When it is true, user, password, host
		* and path are emitted verbatim (query and fragment are still percent-encoded).
		*/
	private string toString(bool humanReadable) {
		Appender!string s;
		s ~= scheme;
		s ~= "://";
		if (user.length) {
			s ~= humanReadable ? user : user.percentEncode;
			s ~= ":";
			s ~= humanReadable ? pass : pass.percentEncode;
			s ~= "@";
		}
		s ~= humanReadable ? host : host.toPuny;
		if (providedPort) {
			// Only spell out the port when it differs from the scheme's default.
			auto defaultPort = scheme in schemeToDefaultPort;
			if (defaultPort is null || *defaultPort != providedPort) {
				s ~= ":";
				s ~= providedPort.to!string;
			}
		}
		string p = path;
		if (p.length == 0 || p == "/") {
			s ~= '/';
		} else {
			// Drop the leading slash; each branch below re-adds the separators it needs.
			if (p[0] == '/') {
				p = p[1..$];
			}
			if (humanReadable) {
				// The separator between authority and path must be restored here, otherwise the
				// result would read "host" immediately followed by the first path segment.
				s ~= '/';
				s ~= p;
			} else {
				// Encode segment by segment so the '/' separators themselves stay literal.
				foreach (part; p.split('/')) {
					s ~= '/';
					s ~= part.percentEncode;
				}
			}
		}
		// Test the length rather than the AA reference so an emptied map emits no dangling '?'.
		if (query.length) {
			s ~= '?';
			bool first = true;
			foreach (k, v; query) {
				if (!first) {
					s ~= '&';
				}
				first = false;
				s ~= k.percentEncode;
				// Keys with an empty value are written bare ("?item"), mirroring how they parse.
				if (v.length > 0) {
					s ~= '=';
					s ~= v.percentEncode;
				}
			}
		}
		if (fragment.length) {
			s ~= '#';
			s ~= fragment.percentEncode;
		}
		return s.data;
	}

	/// Implicitly convert URLs to strings.
	alias toString this;

	/**
		* The append operator (~).
		*
		* The append operator for URLs returns a new URL with the given string appended as a path
		* element to the URL's path. It only adds new path elements (or sequences of path elements).
		*
		* Don't worry about path separators; whether you include them or not, it will just work.
		*
		* Query elements are copied.
		*
		* Examples:
		* ---
		* auto random = "http://testdata.org/random".parseURL;
		* auto randInt = random ~ "int";
		* writeln(randInt);  // prints "http://testdata.org/random/int"
		* ---
		*/
	URL opBinary(string op : "~")(string subsequentPath) {
		URL other = this;
		other ~= subsequentPath;
		// Struct copy shares the associative array; duplicate it so the two URLs are independent.
		if (query) {
			other.query = other.query.dup;
		}
		return other;
	}

	/**
		* The append-in-place operator (~=).
		*
		* The append operator for URLs adds a path element to this URL. It only adds new path elements
		* (or sequences of path elements).
		*
		* Don't worry about path separators; whether you include them or not, it will just work.
		*
		* Examples:
		* ---
		* auto random = "http://testdata.org/random".parseURL;
		* random ~= "int";
		* writeln(random);  // prints "http://testdata.org/random/int"
		* ---
		*/
	URL opOpAssign(string op : "~")(string subsequentPath) {
		if (path.endsWith("/") || subsequentPath.startsWith("/")) {
			if (path.endsWith("/") && subsequentPath.startsWith("/")) {
				// Both sides supply a slash: keep only one.
				path ~= subsequentPath[1..$];
			} else {
				path ~= subsequentPath;
			}
		} else {
			// Neither side supplies a slash: insert one.
			path ~= '/';
			path ~= subsequentPath;
		}
		return this;
	}
}
311 
/**
	* Parse a URL from a string.
	*
	* This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
	* may be made. However, any URL in a correct format will be parsed correctly.
	*
	* The accepted shape is
	* `[scheme://][user[:password]@]host[:port][/path][?query][#fragment]`. When the scheme is
	* missing, "http" is assumed.
	*
	* Params:
	*   value = The text to parse.
	*   url   = Receives the parsed URL. It is reset to URL.init first; when parsing fails it may
	*           be left partially filled in.
	*
	* Returns: true if the string could be parsed, false if a component was malformed (bad percent
	*   encoding, bad port number, invalid UTF data).
	*/
bool tryParseURL(string value, out URL url) {
	url = URL.init;
	try {
		// Scheme is optional in common use. We infer 'http' if it's not given.
		auto i = value.indexOf("://");
		if (i > -1) {
			url.scheme = value[0..i];
			value = value[i+3 .. $];
		} else {
			url.scheme = "http";
		}

		// [user[:password]@]host[:port][/path][?query][#fragment]
		// An '@' only introduces user info when it appears before the path, query and fragment.
		i = value.indexOfAny("@/?#");
		if (i > -1 && value[i] == '@') {
			auto userinfo = value[0..i];
			auto colon = userinfo.indexOf(':');
			if (colon == -1) {
				// User name without a password.
				url.user = userinfo.percentDecode;
			} else {
				url.user = userinfo[0..colon].percentDecode;
				url.pass = userinfo[colon+1 .. $].percentDecode;
			}
			value = value[i+1 .. $];
		}

		// host[:port][/path][?query][#fragment]
		i = value.indexOfAny(":/?#");
		if (i == -1) {
			// Just a hostname.
			url.host = value.fromPuny;
			return true;
		}
		url.host = value[0..i].fromPuny;
		value = value[i .. $];
		if (value[0] == ':') {
			// The port runs up to the start of the path, query or fragment, whichever is first.
			auto end = value.indexOfAny("/?#");
			if (end == -1) {
				end = value.length;
			}
			url.port = value[1 .. end].to!ushort;
			value = value[end .. $];
			if (value.length == 0) {
				return true;
			}
		}

		// [/path][?query][#fragment]
		i = value.indexOfAny("?#");
		if (i == -1) {
			url.path = value.percentDecode;
			return true;
		}
		url.path = value[0..i].percentDecode;
		auto c = value[i];
		value = value[i + 1 .. $];
		if (c == '?') {
			// Split the query off from an optional trailing fragment.
			i = value.indexOf('#');
			string query;
			if (i < 0) {
				query = value;
				value = null;
			} else {
				query = value[0..i];
				value = value[i + 1 .. $];
			}
			foreach (q; query.split('&')) {
				auto j = q.indexOf('=');
				if (j == -1) {
					// A bare key ("?item") maps to the empty string.
					url.query[q.percentDecode] = "";
				} else {
					url.query[q[0..j].percentDecode] = q[j + 1 .. $].percentDecode;
				}
			}
		}

		// Whatever remains is the fragment (empty when there was none).
		url.fragment = value.percentDecode;
		return true;
	} catch (URLException) {
		// Malformed percent-encoding or host name.
		return false;
	} catch (ConvException) {
		// Port was not a number in the ushort range.
		return false;
	} catch (UTFException) {
		// Input was not valid UTF-8.
		return false;
	}
}
421 
///
unittest {
	// Building URLs field by field and checking the machine-readable string form.
	{
		// Basic.
		URL url;
		with (url) {
			scheme = "https";
			host = "example.org";
			path = "/foo/bar";
			query["hello"] = "world";
			query["gibe"] = "clay";
			fragment = "frag";
		}
		assert(
				// Not sure what order it'll come out in.
				url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
				url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
				url.toString);
	}
	{
		// Percent encoded.
		URL url;
		with (url) {
			scheme = "https";
			host = "example.org";
			path = "/f☃o";
			query["❄"] = "❀";
			query["["] = "]";
			fragment = "ş";
		}
		assert(
				// Not sure what order it'll come out in.
				url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
				url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
				url.toString);
	}
	{
		// Port, user, pass.
		URL url;
		with (url) {
			scheme = "https";
			host = "example.org";
			user = "dhasenan";
			pass = "itsasecret";
			port = 17;
		}
		assert(
				url.toString == "https://dhasenan:itsasecret@example.org:17/",
				url.toString);
	}
	{
		// Query with no path.
		URL url;
		with (url) {
			scheme = "https";
			host = "example.org";
			query["hi"] = "bye";
		}
		assert(
				url.toString == "https://example.org/?hi=bye",
				url.toString);
	}
}

unittest {
	// Percent decoding.

	// http://#:!:@
	auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
	auto url = urlString.parseURL;
	assert(url.user == "#");
	assert(url.pass == "!:");
	assert(url.host == "example.org");
	assert(url.path == "/{/}");
	assert(url.query[";"] == "");
	assert(url.query["&"] == "=");
	assert(url.fragment == "#hash");

	// Round trip.
	assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
	assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

unittest {
	// A Punycode host is decoded to Unicode when parsed.
	auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
	assert(url.host == "☂.☃.org", url.host);
}

unittest {
	// toString re-encodes the host; toHumanReadableString keeps it in Unicode.
	auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
	assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
	assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString);
}

unittest {
	// A Unicode host typed directly is Punycode-encoded by toString.
	auto url = "https://☂.☃.org/?hi=bye".parseURL;
	assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
	// Path appending with ~ and ~= under every combination of leading/trailing slashes.

	// There's an existing path.
	auto url = parseURL("http://example.org/foo");
	// No slash? Assume it needs a slash.
	assert((url ~ "bar").toString == "http://example.org/foo/bar");
	// With slash? Don't add another.
	assert((url ~ "/bar").toString == "http://example.org/foo/bar");
	url ~= "bar";
	assert(url.toString == "http://example.org/foo/bar");

	// Path already ends with a slash; don't add another.
	url = parseURL("http://example.org/foo/");
	assert((url ~ "bar").toString == "http://example.org/foo/bar");
	// Still don't add one even if you're appending with a slash.
	assert((url ~ "/bar").toString == "http://example.org/foo/bar");
	url ~= "/bar";
	assert(url.toString == "http://example.org/foo/bar");

	// No path.
	url = parseURL("http://example.org");
	assert((url ~ "bar").toString == "http://example.org/bar");
	assert((url ~ "/bar").toString == "http://example.org/bar");
	url ~= "bar";
	assert(url.toString == "http://example.org/bar");

	// Path is just a slash.
	url = parseURL("http://example.org/");
	assert((url ~ "bar").toString == "http://example.org/bar");
	assert((url ~ "/bar").toString == "http://example.org/bar");
	url ~= "bar";
	assert(url.toString == "http://example.org/bar", url.toString);

	// No path, just fragment.
	url = "ircs://irc.freenode.com/#d".parseURL;
	assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest {
	// Compile-time check only: a URL can be passed where std.net.curl.get expects a string,
	// thanks to the implicit conversion declared on URL. No request is made.
	import std.net.curl;
	auto url = "http://example.org".parseURL;
	assert(is(typeof(std.net.curl.get(url))));
}
564 
/**
	* Parse the input string as a URL, throwing on failure.
	*
	* This is the throwing counterpart of tryParseURL.
	*
	* Params:
	*   value = The text to parse.
	*
	* Returns: the parsed URL.
	*
	* Throws:
	*   URLException if the string was in an incorrect format.
	*/
URL parseURL(string value) {
	URL parsed;
	immutable ok = tryParseURL(value, parsed);
	if (!ok) {
		throw new URLException("failed to parse URL " ~ value);
	}
	return parsed;
}
578 
///
unittest {
	// Field-by-field checks of parseURL on progressively richer inputs.
	{
		// Infer scheme
		auto u1 = parseURL("example.org");
		assert(u1.scheme == "http");
		assert(u1.host == "example.org");
		assert(u1.path == "");
		assert(u1.port == 80);
		assert(u1.providedPort == 0);
		assert(u1.fragment == "");
	}
	{
		// Simple host and scheme
		auto u1 = parseURL("https://example.org");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "");
		assert(u1.port == 443);
		assert(u1.providedPort == 0);
	}
	{
		// With path
		auto u1 = parseURL("https://example.org/foo/bar");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
		assert(u1.port == 443);
		assert(u1.providedPort == 0);
	}
	{
		// With explicit port
		auto u1 = parseURL("https://example.org:1021/foo/bar");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
		assert(u1.port == 1021);
		assert(u1.providedPort == 1021);
	}
	{
		// With user
		auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/foo/bar");
		assert(u1.port == 443);
		assert(u1.user == "bob");
		assert(u1.pass == "secret");
	}
	{
		// With user, URL-encoded
		auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/foo/bar");
		assert(u1.port == 443);
		assert(u1.user == "bob!");
		assert(u1.pass == "secret!?");
	}
	{
		// With user and port and path
		auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/foo/bar");
		assert(u1.port == 2210);
		assert(u1.user == "bob");
		assert(u1.pass == "secret");
		assert(u1.fragment == "");
	}
	{
		// With query string
		auto u1 = parseURL("https://example.org/?login=true");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
		assert(u1.query["login"] == "true");
		assert(u1.fragment == "");
	}
	{
		// With query string and fragment
		auto u1 = parseURL("https://example.org/?login=true#justkidding");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
		assert(u1.query["login"] == "true");
		assert(u1.fragment == "justkidding");
	}
	{
		// With URL-encoded values
		auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
		assert(u1.scheme == "https");
		assert(u1.host == "example.org");
		assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
		assert(u1.query["❄"] == "=");
		assert(u1.fragment == "^");
	}
}

unittest {
	// Default versus explicit ports.
	assert(parseURL("http://example.org").port == 80);
	assert(parseURL("http://example.org:5326").port == 5326);

	// A scheme from the default-port table combined with user info and an explicit port.
	auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
	assert(url.scheme == "redis");
	assert(url.user == "admin");
	assert(url.pass == "password");

	// toString adds the inferred scheme and a trailing slash, and drops a port equal to the default.
	assert(parseURL("example.org").toString == "http://example.org/");
	assert(parseURL("http://example.org:80").toString == "http://example.org/");

	// Without a scheme, "host:port" is still read as host and port.
	assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}
692 
/**
	* Percent-encode a string.
	*
	* URL components cannot contain non-ASCII characters, and there are very few characters that are
	* safe to include as URL components. Domain names using Unicode values use Punycode. For
	* everything else, there is percent encoding.
	*
	* ASCII letters, digits and the characters `-._~` are copied through unchanged. Every other
	* code point is written as the `%XX` form (uppercase hex) of each octet of its UTF-8 encoding.
	*
	* Params:
	*   raw = The text to encode.
	*
	* Returns: the percent-encoded text.
	*/
string percentEncode(string raw) {
	static immutable upperHex = "0123456789ABCDEF";

	Appender!string encoded;
	foreach (dchar ch; raw) {
		immutable isLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
		immutable isDigit = ch >= '0' && ch <= '9';
		immutable isSafeMark = ch == '-' || ch == '.' || ch == '_' || ch == '~';

		if (isLetter || isDigit || isSafeMark) {
			// Unreserved characters never need escaping.
			encoded ~= ch;
			continue;
		}

		// Anything else -- reserved ASCII or non-ASCII alike -- is escaped octet by octet using
		// its UTF-8 representation.
		char[] utf8;
		encode(utf8, ch);
		foreach (octet; cast(const(ubyte)[]) utf8) {
			encoded ~= '%';
			encoded ~= upperHex[octet >> 4];
			encoded ~= upperHex[octet & 0x0F];
		}
	}
	return encoded.data;
}
727 
///
unittest {
	// Unreserved characters pass through untouched.
	assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
	assert(percentEncode("~~--..__") == "~~--..__");
	assert(percentEncode("0123456789") == "0123456789");

	string e;

	// A non-ASCII character becomes one %XX group per UTF-8 octet.
	e = percentEncode("☃");
	assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}
739 
/**
	* Percent-decode a string.
	*
	* URL components cannot contain non-ASCII characters, and there are very few characters that are
	* safe to include as URL components. Domain names using Unicode values use Punycode. For
	* everything else, there is percent encoding.
	*
	* This explicitly ensures that the result is a valid UTF-8 string.
	*
	* Params:
	*   encoded = The percent-encoded text.
	*
	* Returns: the decoded text.
	*
	* Throws:
	*   URLException if the decoded bytes are not valid UTF data, or if percentDecodeRaw rejects
	*   the input.
	*/
string percentDecode(string encoded) {
	// Decode to raw bytes first, then reinterpret them as text and validate.
	auto decoded = cast(string) percentDecodeRaw(encoded);
	if (decoded.isValid) {
		return decoded;
	}
	throw new URLException("input contains invalid UTF data");
}
758 
///
unittest {
	// Text without any percent escapes comes back unchanged.
	assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
	assert(percentDecode("~~--..__") == "~~--..__");
	assert(percentDecode("0123456789") == "0123456789");

	string e;

	// Three escaped octets combine into a single UTF-8 character.
	e = percentDecode("%E2%98%83");
	assert(e == "☃", "expected a snowman but got" ~ e);
}
770 
/**
	* Percent-decode a string into a ubyte array.
	*
	* URL components cannot contain non-ASCII characters, and there are very few characters that are
	* safe to include as URL components. Domain names using Unicode values use Punycode. For
	* everything else, there is percent encoding.
	*
	* This yields a ubyte array and will not perform validation on the output. However, an improperly
	* formatted input string will result in a URLException.
	*
	* Hexadecimal digits are accepted in either case ("%e2" and "%E2" are equivalent).
	*
	* Params:
	*   encoded = The percent-encoded text.
	*
	* Returns: the decoded octets.
	*
	* Throws:
	*   URLException if a '%' is not followed by exactly two hexadecimal digits.
	*/
ubyte[] percentDecodeRaw(string encoded) {
	// Value of a single hexadecimal digit, or -1 if the character is not one.
	static int hexValue(char ch) {
		if (ch >= '0' && ch <= '9') {
			return ch - '0';
		}
		if (ch >= 'A' && ch <= 'F') {
			return ch - 'A' + 10;
		}
		if (ch >= 'a' && ch <= 'f') {
			return ch - 'a' + 10;
		}
		return -1;
	}

	// We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
	Appender!(ubyte[]) app;
	for (size_t i = 0; i < encoded.length; i++) {
		if (encoded[i] != '%') {
			app ~= encoded[i];
			continue;
		}
		// Compare remaining length instead of subtracting from encoded.length: the latter is
		// unsigned and wraps around for very short inputs such as "%".
		if (encoded.length - i < 3) {
			throw new URLException("Invalid percent encoded value: expected two characters after " ~
					"percent symbol. Error at index " ~ i.to!string);
		}
		immutable hi = hexValue(encoded[i + 1]);
		immutable lo = hexValue(encoded[i + 2]);
		if (hi < 0 || lo < 0) {
			throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
					"digits after percent symbol. Error at index " ~ i.to!string);
		}
		app ~= cast(ubyte)((hi << 4) | lo);
		// Skip the two digits just consumed; the loop increment skips the '%' itself.
		i += 2;
	}
	return app.data;
}
800 
/**
	* Convert a (possibly Unicode) host name to its ASCII form for use in a URL.
	*
	* Host names made up only of permitted ASCII characters are returned unchanged. If a character
	* above U+0080 is found, each dot-separated label is passed through punyEncode.
	*
	* Throws:
	*   URLException if an ASCII character outside the permitted ranges is seen before any
	*   character above U+0080.
	*/
private string toPuny(string unicodeHostname) {
	bool mustEncode = false;
	foreach (i, dchar d; unicodeHostname) {
		auto c = cast(uint) d;
		if (c > 0x80) {
			mustEncode = true;
			break;
		}
		// Rejected ranges: everything below ',', the run ':' .. '@', the run '[' .. '`', and
		// everything from '{' upward. The second range's upper bound is hexadecimal 0x40 ('@');
		// written as decimal 40 the range would be empty and never reject anything.
		if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
			throw new URLException(
					format(
						"domain name '%s' contains illegal character '%s' at position %s",
						unicodeHostname, d, i));
		}
	}
	if (!mustEncode) {
		return unicodeHostname;
	}
	// Punycode works per label, so encode each dot-separated piece on its own.
	return unicodeHostname.split('.').map!punyEncode.join(".");
}
821 
/**
	* Convert a host name from its ASCII (Punycode) form back to Unicode.
	*
	* Each dot-separated label is passed through punyDecode and the results are re-joined with dots.
	*/
private string fromPuny(string hostname) {
	auto labels = hostname.split('.');
	auto decodedLabels = labels.map!punyDecode;
	return decodedLabels.join(".");
}
825 
private {
	// Punycode parameters and markers used by punyEncode, punyDecode and adapt.
	enum delimiter = '-';      // separates the literal prefix from the encoded suffix
	enum marker = "xn--";      // prefix identifying an encoded label
	enum ulong damp = 700;
	enum ulong tmin = 1;
	enum ulong tmax = 26;
	enum ulong skew = 38;
	enum ulong base = 36;
	enum ulong initialBias = 72;
	enum dchar initialN = cast(dchar)128;

	/**
		* Bias adaptation step of the Punycode algorithm.
		*
		* Params:
		*   delta     = The delta that was just encoded or decoded.
		*   numPoints = Divisor for the proportional adjustment; callers pass the number of code
		*               points handled so far. Must not be zero.
		*   firstTime = Whether to scale by damp (true) or by 2 (false).
		*
		* Returns: the new bias.
		*/
	ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
		enum ulong span = base - tmin;
		enum ulong threshold = (span * tmax) / 2;

		delta = firstTime ? delta / damp : delta / 2;
		delta += delta / numPoints;

		ulong k = 0;
		for (; delta > threshold; k += base) {
			delta /= span;
		}
		return k + (((span + 1) * delta) / (delta + skew));
	}
}
852 
/**
	* Encode the input string using the Punycode algorithm.
	*
	* Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
	* with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
	* in Punycode, you will get "xn--m3h.xn--n3h.com".
	*
	* In order to puny-encode a domain name, you must split it into its components. The following will
	* typically suffice:
	* ---
	* auto domain = "☂.☃.com";
	* auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
	* ---
	*
	* Params:
	*   input = A single domain name segment.
	*
	* Returns: the encoded segment prefixed with "xn--", or the input itself when it contains only
	*   basic (below U+0080) code points.
	*
	* Throws:
	*   URLException on arithmetic overflow or if the algorithm cannot make progress.
	*/
string punyEncode(string input) {
	ulong delta = 0;
	dchar n = initialN;
	auto bias = initialBias;
	Appender!string output;
	output ~= marker;
	// pushed counts code points already emitted; codePoints counts all code points in the input.
	auto pushed = 0;
	auto codePoints = 0;
	// First pass: copy the basic code points verbatim. Basic means strictly below initialN
	// (U+0080); U+0080 itself is non-basic and has to be encoded.
	foreach (dchar c; input) {
		codePoints++;
		if (c < initialN) {
			output ~= c;
			pushed++;
		}
	}
	if (pushed < codePoints) {
		// The delimiter is only written when there is a literal prefix to delimit.
		if (pushed > 0) {
			output ~= delimiter;
		}
	} else {
		// No encoding to do.
		return input;
	}
	bool first = true;
	while (pushed < codePoints) {
		// Find the smallest code point not yet handled (i.e. >= n).
		auto best = dchar.max;
		foreach (dchar c; input) {
			if (n <= c && c < best) {
				best = c;
			}
		}
		if (best == dchar.max) {
			throw new URLException("failed to find a new codepoint to process during punyencode");
		}
		delta += (best - n) * (pushed + 1);
		if (delta > uint.max) {
			// TODO better error message
			throw new URLException("overflow during punyencode");
		}
		n = best;
		foreach (dchar c; input) {
			if (c < n) {
				delta++;
			}
			if (c == n) {
				// Emit delta as a variable-length integer in base 36.
				ulong q = delta;
				auto k = base;
				while (true) {
					ulong t;
					if (k <= bias) {
						t = tmin;
					} else if (k >= bias + tmax) {
						t = tmax;
					} else {
						t = k - bias;
					}
					if (q < t) {
						break;
					}
					output ~= digitToBasic(t + ((q - t) % (base - t)));
					q = (q - t) / (base - t);
					k += base;
				}
				output ~= digitToBasic(q);
				pushed++;
				bias = adapt(delta, pushed, first);
				first = false;
				delta = 0;
			}
		}
		delta++;
		n++;
	}
	return output.data;
}
943 
/**
	* Decode the input string using the Punycode algorithm.
	*
	* Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
	* with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
	* in Punycode, you will get "xn--m3h.xn--n3h.com".
	*
	* In order to puny-decode a domain name, you must split it into its components. The following will
	* typically suffice:
	* ---
	* auto domain = "xn--m3h.xn--n3h.com";
	* auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
	* ---
	*
	* Params:
	*   input = A single domain name segment.
	*
	* Returns: the decoded segment, or the input itself when it does not start with "xn--".
	*
	* Throws:
	*   URLException if the segment is not valid Punycode: truncated input, a character that is
	*   not a Punycode digit, a non-basic character in the literal prefix, arithmetic overflow, or
	*   a decoded value that is not a valid code point.
	*/
string punyDecode(string input) {
	if (!input.startsWith(marker)) {
		return input;
	}
	input = input[marker.length..$];

	// let n = initial_n
	dchar n = initialN;

	// let i = 0
	// let bias = initial_bias
	// let output = an empty string indexed from 0
	ulong i = 0;
	auto bias = initialBias;
	dchar[] output;
	// This reserves a bit more than necessary, but it should be more efficient overall than just
	// appending and inserting volo-nolo.
	output.reserve(input.length);

	// consume all code points before the last delimiter (if there is one)
	//   and copy them to output, fail on any non-basic code point
	// if more than zero code points were consumed then consume one more
	//   (which will be the last delimiter)
	auto end = input.lastIndexOf(delimiter);
	if (end > -1) {
		foreach (dchar c; input[0..end]) {
			if (c >= initialN) {
				throw new URLException("invalid punycode: non-basic code point in literal prefix");
			}
			output ~= c;
		}
		input = input[end+1 .. $];
	}

	// while the input is not exhausted do begin
	size_t pos = 0;
	while (pos < input.length) {
		//   let oldi = i
		//   let w = 1
		immutable oldi = i;
		// 64-bit so the overflow checks below can be made against uint.max without wrapping.
		ulong w = 1;
		//   for k = base to infinity in steps of base do begin
		for (ulong k = base; k < uint.max; k += base) {
			//     consume a code point, or fail if there was none to consume
			if (pos >= input.length) {
				throw new URLException("invalid punycode: input ends in the middle of a code point");
			}
			// Note that the input is all ASCII, so we can simply index the input string bytewise.
			immutable c = input[pos];
			pos++;
			//     let digit = the code point's digit-value, fail if it has none
			immutable ulong digit = basicToDigit(c);
			if (digit >= base) {
				throw new URLException("invalid punycode: unexpected character in encoded segment");
			}
			//     let i = i + digit * w, fail on overflow
			if (digit > (uint.max - i) / w) {
				throw new URLException("overflow during punydecode");
			}
			i += digit * w;
			//     let t = tmin if k <= bias {+ tmin}, or
			//             tmax if k >= bias + tmax, or k - bias otherwise
			ulong t;
			if (k <= bias) {
				t = tmin;
			} else if (k >= bias + tmax) {
				t = tmax;
			} else {
				t = k - bias;
			}
			//     if digit < t then break
			if (digit < t) {
				break;
			}
			//     let w = w * (base - t), fail on overflow
			if (w > uint.max / (base - t)) {
				throw new URLException("overflow during punydecode");
			}
			w *= (base - t);
			//   end
		}
		immutable ulong slots = output.length + 1;
		//   let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
		bias = adapt(i - oldi, slots, oldi == 0);
		//   let n = n + i div (length(output) + 1), fail on overflow
		immutable ulong next = n + i / slots;
		if (next > dchar.max || !isValidDchar(cast(dchar) next)) {
			throw new URLException("invalid punycode: decoded value is not a valid code point");
		}
		n = cast(dchar) next;
		//   let i = i mod (length(output) + 1)
		i %= slots;
		//   {if n is a basic code point then fail}
		// (We aren't actually going to fail here; it's clear what this means.)
		//   insert n into output at position i
		output.insertInPlace(cast(size_t) i, n);
		//   increment i
		i++;
		// end
	}
	return output.to!string;
}
1040 
// Lifted from punycode.js.
// Maps a Punycode digit value to its character: 0..25 become 'a'..'z' and 26..35 become '0'..'9'.
private dchar digitToBasic(ulong digit) {
	// Letters start at 'a' (97); digits start at '0' (48), which is 22 above the value 26.
	immutable ulong offset = digit < 26 ? 'a' : '0' - 26;
	return cast(dchar)(digit + offset);
}
1045 
// Lifted from punycode.js.
// Maps a character to its Punycode digit value: '0'..'9' give 26..35, letters of either case give
// 0..25. Any other character yields `base`, which is one past the largest valid digit.
private uint basicToDigit(char c) {
	if (c >= '0' && c <= '9') {
		return c - '0' + 26;
	}
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';
	}
	if (c >= 'a' && c <= 'z') {
		return c - 'a';
	}
	return base;
}
1060 
unittest {
	// punyEncode against known encodings.
	{
		auto a = "b\u00FCcher";
		assert(punyEncode(a) == "xn--bcher-kva");
	}
	{
		// The same non-ASCII character occurring twice.
		auto a = "b\u00FCc\u00FCher";
		assert(punyEncode(a) == "xn--bcher-kvab");
	}
	{
		// Two different non-ASCII characters.
		auto a = "ýbücher";
		auto b = punyEncode(a);
		assert(b == "xn--bcher-kvaf", b);
	}

	{
		auto a = "mañana";
		assert(punyEncode(a) == "xn--maana-pta");
	}

	{
		// Input with no ASCII characters at all, so the output has no literal prefix.
		auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
			~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
		auto b = punyEncode(a);
		assert(b == "xn--egbpdaj6bu4bxfgehfvwxn", b);
	}
	import std.stdio;
}

unittest {
	// punyDecode is the inverse of the encodings checked above.
	{
		auto b = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
		assert(b == "ليهمابتكلموشعربي؟", b);
	}
	{
		assert(punyDecode("xn--maana-pta") == "mañana");
	}
}

unittest {
	// The per-label recipes given in the punyEncode/punyDecode documentation.
	import std.string, std.algorithm, std.array, std.range;
	{
		auto domain = "xn--m3h.xn--n3h.com";
		auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
		assert(decodedDomain == "☂.☃.com", decodedDomain);
	}
	{
		auto domain = "☂.☃.com";
		auto decodedDomain = domain.splitter(".").map!(punyEncode).join(".");
		assert(decodedDomain == "xn--m3h.xn--n3h.com", decodedDomain);
	}
}