1 /**
2 	* A URL handling library.
3 	*
	* URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
	* elements like port, path, username, and password.
6 	*
7 	* This module aims to make it simple to muck about with them.
8 	*
9 	* Example usage:
10 	* ---
11 	* auto url = "ssh://me:password@192.168.0.8/".parseURL;
12 	* auto files = system("ssh", url.toString, "ls").splitLines;
13 	* foreach (file; files) {
14 	*		system("scp", url ~ file, ".");
15 	* }
16 	* ---
17 	*
18 	* License: The MIT license.
19 	*/
20 module url;
21 
import std.algorithm;
import std.array;
import std.conv;
import std.encoding;
import std.string;
import std.utf;
28 
29 @safe:
30 
/**
	* An exception thrown when something bad happens with URLs.
	*
	* Params:
	*   msg = a description of what went wrong
	*   file = source file of the throw site (filled in automatically)
	*   line = source line of the throw site (filled in automatically)
	*/
class URLException : Exception {
	// Forward file and line so the exception reports where it was thrown rather than the
	// location of this constructor.
	this(string msg, string file = __FILE__, size_t line = __LINE__) {
		super(msg, file, line);
	}
}
35 
/**
	* Default port numbers, keyed by URL scheme.
	*
	* The table is a best-effort guess and is not exhaustive: some schemes do not use ports at all,
	* and some that do are not tied to one single port.
	*/
ushort[string] schemeToDefaultPort;

// Populate the table once at module start-up.
static this() {
	ushort[string] defaults = [
		"aaa": 3868,
		"aaas": 5658,
		"acap": 674,
		"amqp": 5672,
		"cap": 1026,
		"coap": 5683,
		"coaps": 5684,
		"dav": 443,
		"dict": 2628,
		"ftp": 21,
		"git": 9418,
		"go": 1096,
		"gopher": 70,
		"http": 80,
		"https": 443,
		"iac": 4569,
		"icap": 1344,
		"imap": 143,
		"ipp": 631,
		"ipps": 631,  // same port as plain ipp
		"irc": 6667,  // the de facto port rather than the IANA reserved one
		"ircs": 6697,
		"iris": 702,  // treated as iris.beep
		"iris.beep": 702,
		"iris.lwz": 715,
		"iris.xpc": 713,
		"iris.xpcs": 714,
		"jabber": 5222,  // client-to-server traffic
		"ldap": 389,
		"ldaps": 636,
		"msrp": 2855,
		"msrps": 2855,
		"mtqp": 1038,
		"mupdate": 3905,
		"news": 119,
		"nfs": 2049,
		"pop": 110,
		"redis": 6379,
		"reload": 6084,
		"rsync": 873,
		"rtmfp": 1935,
		"rtsp": 554,
		"shttp": 80,
		"sieve": 4190,
		"sip": 5060,
		"sips": 5061,
		"smb": 445,
		"smtp": 25,
		"snews": 563,
		"snmp": 161,
		"soap.beep": 605,
		"ssh": 22,
		"stun": 3478,
		"stuns": 5349,
		"svn": 3690,
		"teamspeak": 9987,
		"telnet": 23,
		"tftp": 69,
		"tip": 3372,
	];
	schemeToDefaultPort = defaults;
}
107 
/**
	* A Uniform Resource Locator.
	*
	* URLs can be parsed (see parseURL) and implicitly convert to strings.
	*/
struct URL {
	/// The URL scheme. For instance, ssh, ftp, or https.
	string scheme;

	/// The username in this URL. Usually absent.
	string user;

	/// The password in this URL. Usually absent.
	string pass;

	/// The hostname.
	string host;

	/**
		* The port.
		*
		* This is inferred from the scheme if it isn't present in the URL itself.
		* If the scheme is not known and the port is not present, the port will be given as 0.
		* For some schemes, port will not be sensible -- for instance, file or chrome-extension.
		*
		* If you explicitly need to detect whether the user provided a port, check the providedPort
		* field.
		*/
	@property ushort port() {
		if (providedPort != 0) {
			return providedPort;
		}
		if (auto p = scheme in schemeToDefaultPort) {
			return *p;
		}
		return 0;
	}

	/**
		* Set the port.
		*
		* This sets the providedPort field and is provided for convenience.
		*
		* Returns: the value that was stored.
		*/
	@property ushort port(ushort value) {
		return providedPort = value;
	}

	/// The port that was explicitly provided in the URL. Zero means no port was provided.
	ushort providedPort;

	/**
		* The path.
		*
		* For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
		* "/news/story/17774".
		*/
	string path;

	/**
		* The query string elements.
		*
		* For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
		* elements will be ["visited": "false"].
		*
		* Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
		* ["item": ""].
		*
		* This field is mutable, so be cautious.
		*/
	string[string] query;

	/**
		* The fragment. In web documents, this typically refers to an anchor element.
		* For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
		*/
	string fragment;

	/**
		* Convert this URL to a string.
		* The string is properly formatted and usable for, eg, a web request.
		*/
	string toString() {
		return toString(false);
	}

	/**
		* Convert this URL to a string.
		* The string is intended to be human-readable rather than machine-readable: the user,
		* password, host and path are emitted as stored instead of being percent- or
		* Punycode-encoded.
		*/
	string toHumanReadableString() {
		return toString(true);
	}

	// Shared implementation of toString and toHumanReadableString.
	private string toString(bool humanReadable) {
		Appender!string s;
		s ~= scheme;
		s ~= "://";
		// Test lengths rather than pointers: an empty string literal is non-null, and a URL with
		// only a password must not silently lose it.
		if (user.length || pass.length) {
			s ~= humanReadable ? user : user.percentEncode;
			s ~= ":";
			s ~= humanReadable ? pass : pass.percentEncode;
			s ~= "@";
		}
		s ~= humanReadable ? host : host.toPuny;
		if (providedPort) {
			// Leave the port out when it is simply the default for the scheme.
			auto defaultPort = scheme in schemeToDefaultPort;
			if (defaultPort is null || *defaultPort != providedPort) {
				s ~= ":";
				s ~= providedPort.to!string;
			}
		}
		string p = path;
		if (p.length == 0 || p == "/") {
			s ~= '/';
		} else {
			if (p[0] == '/') {
				p = p[1..$];
			}
			if (humanReadable) {
				// The leading slash was stripped above; put it back so host and path stay separated.
				s ~= '/';
				s ~= p;
			} else {
				// Encode each segment on its own so the separators themselves stay literal.
				foreach (part; p.split('/')) {
					s ~= '/';
					s ~= part.percentEncode;
				}
			}
		}
		if (query.length) {
			s ~= '?';
			bool first = true;
			foreach (k, v; query) {
				if (!first) {
					s ~= '&';
				}
				first = false;
				s ~= k.percentEncode;
				// A key with an empty value is written bare, without '='.
				if (v.length > 0) {
					s ~= '=';
					s ~= v.percentEncode;
				}
			}
		}
		if (fragment.length) {
			s ~= '#';
			s ~= fragment.percentEncode;
		}
		return s.data;
	}

	/// Implicitly convert URLs to strings.
	alias toString this;

	/**
		* The append operator (~).
		*
		* The append operator for URLs returns a new URL with the given string appended as a path
		* element to the URL's path. It only adds new path elements (or sequences of path elements).
		*
		* Don't worry about path separators; whether you include them or not, it will just work.
		*
		* Query elements are copied.
		*
		* Examples:
		* ---
		* auto random = "http://testdata.org/random".parseURL;
		* auto randInt = random ~ "int";
		* writeln(randInt);  // prints "http://testdata.org/random/int"
		* ---
		*/
	URL opBinary(string op : "~")(string subsequentPath) {
		URL other = this;
		other ~= subsequentPath;
		// Associative arrays have reference semantics; duplicate so the two URLs don't share one.
		if (query) {
			other.query = other.query.dup;
		}
		return other;
	}

	/**
		* The append-in-place operator (~=).
		*
		* The append operator for URLs adds a path element to this URL. It only adds new path elements
		* (or sequences of path elements).
		*
		* Don't worry about path separators; whether you include them or not, it will just work.
		*
		* Examples:
		* ---
		* auto random = "http://testdata.org/random".parseURL;
		* random ~= "int";
		* writeln(random);  // prints "http://testdata.org/random/int"
		* ---
		*/
	URL opOpAssign(string op : "~")(string subsequentPath) {
		immutable pathHasSlash = path.endsWith("/");
		immutable addedHasSlash = subsequentPath.startsWith("/");
		if (pathHasSlash && addedHasSlash) {
			// Both sides bring a separator; keep only one.
			path ~= subsequentPath[1..$];
		} else if (pathHasSlash || addedHasSlash) {
			path ~= subsequentPath;
		} else {
			path ~= '/';
			path ~= subsequentPath;
		}
		return this;
	}
}
314 
/**
	* Try to parse a URL from a string.
	*
	* This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
	* may be made. The accepted shape is:
	*
	*   [scheme://][user[:password]@]host[:port][/path][?query][#fragment]
	*
	* When no scheme is present, "http" is assumed.
	*
	* Params:
	*   value = the text to parse
	*   url = receives the parsed URL; it may be only partially filled in when false is returned
	*
	* Returns: true on success; false if the port is not a valid ushort or if decoding any
	*   component raised a URLException.
	*/
bool tryParseURL(string value, out URL url) {
	url = URL.init;
	try {
		// Scheme is optional in common use. We infer 'http' if it's not given.
		auto i = value.indexOf("://");
		if (i > -1) {
			url.scheme = value[0..i];
			value = value[i+3 .. $];
		} else {
			url.scheme = "http";
		}

		// Userinfo ends at the first '@' that comes before the path, query or fragment. Splitting
		// inside the userinfo slice keeps "user@host:port" from mistaking the port colon for the
		// user/password separator.
		auto at = value.indexOfAny("@/?#");
		if (at > -1 && value[at] == '@') {
			auto userinfo = value[0..at];
			auto colon = userinfo.indexOf(':');
			if (colon > -1) {
				url.user = userinfo[0..colon].percentDecode;
				url.pass = userinfo[colon+1 .. $].percentDecode;
			} else {
				url.user = userinfo.percentDecode;
			}
			value = value[at+1 .. $];
		}

		// The host runs up to the port, path, query or fragment, whichever comes first.
		i = value.indexOfAny(":/?#");
		if (i == -1) {
			// Just a hostname.
			url.host = value.fromPuny;
			return true;
		}
		url.host = value[0..i].fromPuny;
		value = value[i .. $];

		if (value[0] == ':') {
			auto end = value.indexOfAny("/?#");
			if (end == -1) {
				end = value.length;
			}
			try {
				url.port = value[1 .. end].to!ushort;
			} catch (ConvException) {
				return false;
			}
			value = value[end .. $];
			if (value.length == 0) {
				return true;
			}
		}

		i = value.indexOfAny("?#");
		if (i == -1) {
			url.path = value.percentDecode;
			return true;
		}

		url.path = value[0..i].percentDecode;
		auto c = value[i];
		value = value[i + 1 .. $];
		if (c == '?') {
			i = value.indexOf('#');
			string query;
			if (i < 0) {
				query = value;
				value = null;
			} else {
				query = value[0..i];
				value = value[i + 1 .. $];
			}
			foreach (q; query.split('&')) {
				// "a&&b" or a trailing '&' yields empty pieces; they carry no information.
				if (q.length == 0) {
					continue;
				}
				auto j = q.indexOf('=');
				if (j == -1) {
					url.query[q.percentDecode] = "";
				} else {
					url.query[q[0..j].percentDecode] = q[j + 1 .. $].percentDecode;
				}
			}
		}

		// Whatever is left (possibly nothing) is the fragment.
		url.fragment = value.percentDecode;
		return true;
	} catch (URLException) {
		// A component failed to decode: report failure rather than letting the exception escape.
		return false;
	}
}
424 
///
unittest {
	{
		// Plain ASCII components.
		URL url;
		url.scheme = "https";
		url.host = "example.org";
		url.path = "/foo/bar";
		url.query["hello"] = "world";
		url.query["gibe"] = "clay";
		url.fragment = "frag";
		// The query is an associative array, so either ordering is acceptable.
		auto text = url.toString;
		assert(
				text == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
				text == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
				text);
	}
	{
		// Components that need percent encoding.
		URL url;
		url.scheme = "https";
		url.host = "example.org";
		url.path = "/f☃o";
		url.query["❄"] = "❀";
		url.query["["] = "]";
		url.fragment = "ş";
		// Again, either query ordering is acceptable.
		auto text = url.toString;
		assert(
				text == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
				text == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
				text);
	}
	{
		// Credentials and an explicit port.
		URL url;
		url.scheme = "https";
		url.host = "example.org";
		url.user = "dhasenan";
		url.pass = "itsasecret";
		url.port = 17;
		auto text = url.toString;
		assert(text == "https://dhasenan:itsasecret@example.org:17/", text);
	}
	{
		// A query but no path.
		URL url;
		url.scheme = "https";
		url.host = "example.org";
		url.query["hi"] = "bye";
		auto text = url.toString;
		assert(text == "https://example.org/?hi=bye", text);
	}
}
488 
unittest {
	// Every component of this URL is percent-encoded.

	// Once decoded, the credentials read: http://#:!:@
	string encoded = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
	auto parsed = encoded.parseURL;
	assert(parsed.user == "#");
	assert(parsed.pass == "!:");
	assert(parsed.host == "example.org");
	assert(parsed.path == "/{/}");
	assert(parsed.query[";"] == "");
	assert(parsed.query["&"] == "=");
	assert(parsed.fragment == "#hash");

	// Parsing and formatting again must reproduce the input, once and twice over.
	auto once = encoded.parseURL.toString;
	assert(encoded == once, once);
	auto twice = once.parseURL.toString;
	assert(encoded == twice);
}
507 
unittest {
	// Punycode host labels are decoded while parsing.
	auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
	assert(url.host == "☂.☃.org", url.host);
}

unittest {
	// The machine-readable form re-encodes the host; the human-readable form does not.
	auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
	assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
	// On failure, report the value that was actually being compared.
	assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toHumanReadableString);
}

unittest {
	// A Unicode host is Punycode-encoded on output.
	auto url = "https://☂.☃.org/?hi=bye".parseURL;
	assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
}
523 
///
unittest {
	// Start from a URL that already has a path.
	auto url = parseURL("http://example.org/foo");
	// The appended piece has no slash, so one is inserted.
	assert((url ~ "bar").toString == "http://example.org/foo/bar");
	// The appended piece brings its own slash, so no second one is added.
	assert((url ~ "/bar").toString == "http://example.org/foo/bar");
	url ~= "bar";
	assert(url.toString == "http://example.org/foo/bar");

	// The existing path ends in a slash already.
	url = parseURL("http://example.org/foo/");
	assert((url ~ "bar").toString == "http://example.org/foo/bar");
	// Slashes on both sides still collapse to a single one.
	assert((url ~ "/bar").toString == "http://example.org/foo/bar");
	url ~= "/bar";
	assert(url.toString == "http://example.org/foo/bar");

	// The URL has no path at all.
	url = parseURL("http://example.org");
	assert((url ~ "bar").toString == "http://example.org/bar");
	assert((url ~ "/bar").toString == "http://example.org/bar");
	url ~= "bar";
	assert(url.toString == "http://example.org/bar");

	// The path is only a slash.
	url = parseURL("http://example.org/");
	assert((url ~ "bar").toString == "http://example.org/bar");
	assert((url ~ "/bar").toString == "http://example.org/bar");
	url ~= "bar";
	assert(url.toString == "http://example.org/bar", url.toString);

	// A fragment directly after the root path.
	url = "ircs://irc.freenode.com/#d".parseURL;
	assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest {
	// A URL must be usable where std.net.curl expects a string. Only the type is checked;
	// no request is made.
	import std.net.curl;
	auto url = "http://example.org".parseURL;
	assert(is(typeof(std.net.curl.get(url))));
}
567 
/**
	* Parse the input string as a URL.
	*
	* This is the throwing counterpart of tryParseURL.
	*
	* Params:
	*   value = the text to parse
	*
	* Returns: the parsed URL.
	*
	* Throws:
	*   URLException if the string was in an incorrect format.
	*/
URL parseURL(string value) {
	URL parsed;
	if (!tryParseURL(value, parsed)) {
		throw new URLException("failed to parse URL " ~ value);
	}
	return parsed;
}
581 
///
unittest {
	{
		// No scheme given: http is assumed.
		auto u = parseURL("example.org");
		assert(u.scheme == "http");
		assert(u.host == "example.org");
		assert(u.path == "");
		assert(u.port == 80);
		assert(u.providedPort == 0);
		assert(u.fragment == "");
	}
	{
		// Scheme and host only.
		auto u = parseURL("https://example.org");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "");
		assert(u.port == 443);
		assert(u.providedPort == 0);
	}
	{
		// A path after the host.
		auto u = parseURL("https://example.org/foo/bar");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/foo/bar", "expected /foo/bar but got " ~ u.path);
		assert(u.port == 443);
		assert(u.providedPort == 0);
	}
	{
		// An explicit port overrides the scheme default.
		auto u = parseURL("https://example.org:1021/foo/bar");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/foo/bar", "expected /foo/bar but got " ~ u.path);
		assert(u.port == 1021);
		assert(u.providedPort == 1021);
	}
	{
		// User name and password.
		auto u = parseURL("https://bob:secret@example.org/foo/bar");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/foo/bar");
		assert(u.port == 443);
		assert(u.user == "bob");
		assert(u.pass == "secret");
	}
	{
		// Percent-encoded user name and password.
		auto u = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/foo/bar");
		assert(u.port == 443);
		assert(u.user == "bob!");
		assert(u.pass == "secret!?");
	}
	{
		// Credentials, port and path all together.
		auto u = parseURL("https://bob:secret@example.org:2210/foo/bar");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/foo/bar");
		assert(u.port == 2210);
		assert(u.user == "bob");
		assert(u.pass == "secret");
		assert(u.fragment == "");
	}
	{
		// A query string.
		auto u = parseURL("https://example.org/?login=true");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/", "expected path: / actual path: " ~ u.path);
		assert(u.query["login"] == "true");
		assert(u.fragment == "");
	}
	{
		// A query string followed by a fragment.
		auto u = parseURL("https://example.org/?login=true#justkidding");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/", "expected path: / actual path: " ~ u.path);
		assert(u.query["login"] == "true");
		assert(u.fragment == "justkidding");
	}
	{
		// Percent-encoded path, query and fragment.
		auto u = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
		assert(u.scheme == "https");
		assert(u.host == "example.org");
		assert(u.path == "/☃", "expected path: /☃ actual path: " ~ u.path);
		assert(u.query["❄"] == "=");
		assert(u.fragment == "^");
	}
}

unittest {
	// Default port versus explicit port.
	assert(parseURL("http://example.org").port == 80);
	assert(parseURL("http://example.org:5326").port == 5326);

	// A scheme other than http(s), with credentials.
	auto redis = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
	assert(redis.scheme == "redis");
	assert(redis.user == "admin");
	assert(redis.pass == "password");

	// Output normalisation: the scheme is filled in and a default port is dropped.
	assert(parseURL("example.org").toString == "http://example.org/");
	assert(parseURL("http://example.org:80").toString == "http://example.org/");

	// A scheme-less host with a port.
	assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}
695 
/**
	* Percent-encode a string.
	*
	* Only ASCII letters, ASCII digits and the characters - . _ ~ are copied through unchanged.
	* Every other code point is written as the octets of its UTF-8 encoding, each in the form
	* %XX with upper-case hexadecimal digits.
	*
	* Params:
	*   raw = the text to encode
	*
	* Returns: the percent-encoded text.
	*/
string percentEncode(string raw) {
	static immutable hexDigits = "0123456789ABCDEF";
	Appender!string encoded;
	foreach (dchar codePoint; raw) {
		bool unreserved;
		switch (codePoint) {
			case 'a': .. case 'z':
			case 'A': .. case 'Z':
			case '0': .. case '9':
			case '-', '.', '_', '~':
				unreserved = true;
				break;
			default:
				unreserved = false;
				break;
		}
		if (unreserved) {
			encoded ~= cast(char) codePoint;
			continue;
		}
		// Everything else, ASCII or not, goes out as the %XX form of each of its UTF-8 octets.
		char[4] utf8;
		immutable count = encode(utf8, codePoint);
		foreach (idx; 0 .. count) {
			immutable octet = cast(ubyte) utf8[idx];
			encoded ~= '%';
			encoded ~= hexDigits[octet >> 4];
			encoded ~= hexDigits[octet & 0x0F];
		}
	}
	return encoded.data;
}
730 
///
unittest {
	// Unreserved characters pass through unchanged.
	foreach (untouched; ["IDontNeedNoPercentEncoding", "~~--..__", "0123456789"]) {
		assert(percentEncode(untouched) == untouched);
	}

	// A non-ASCII character becomes the percent-encoded octets of its UTF-8 form.
	auto snowman = percentEncode("☃");
	assert(snowman == "%E2%98%83", "expected %E2%98%83 but got" ~ snowman);
}
742 
/**
	* Percent-decode a string.
	*
	* The input is decoded with percentDecodeRaw and the resulting bytes are then checked, so the
	* returned string is always valid UTF-8.
	*
	* Params:
	*   encoded = the percent-encoded text
	*
	* Returns: the decoded text.
	*
	* Throws:
	*   URLException if the decoded bytes are not valid UTF-8, or if percentDecodeRaw rejects the
	*   input.
	*/
@trusted string percentDecode(string encoded) {
	// Reinterpret the bytes as text; the validity check below is what makes that acceptable.
	auto decoded = cast(string) percentDecodeRaw(encoded);
	if (decoded.isValid) {
		return decoded;
	}
	throw new URLException("input contains invalid UTF data");
}
761 
///
unittest {
	// Text without percent escapes comes back unchanged.
	foreach (untouched; ["IDontNeedNoPercentDecoding", "~~--..__", "0123456789"]) {
		assert(percentDecode(untouched) == untouched);
	}

	// Three escaped octets decode to a single snowman.
	auto snowman = percentDecode("%E2%98%83");
	assert(snowman == "☃", "expected a snowman but got" ~ snowman);
}
773 
/**
	* Percent-decode a string into a ubyte array.
	*
	* Every "%XY" sequence is replaced by the byte with hexadecimal value XY; the hex digits may be
	* upper or lower case. All other bytes are copied through unchanged.
	*
	* This yields a ubyte array and performs no validation on the output: the result need not be
	* valid UTF-8.
	*
	* Params:
	*   encoded = the percent-encoded text
	*
	* Returns: the decoded bytes.
	*
	* Throws:
	*   URLException if a percent sign is not followed by two hexadecimal digits.
	*/
ubyte[] percentDecodeRaw(string encoded) {
	// Value of a single hexadecimal digit, or -1 if the character is not one.
	static int hexValue(char ch) {
		if (ch >= '0' && ch <= '9') {
			return ch - '0';
		}
		if (ch >= 'A' && ch <= 'F') {
			return ch - 'A' + 10;
		}
		if (ch >= 'a' && ch <= 'f') {
			return ch - 'a' + 10;
		}
		return -1;
	}

	// We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
	Appender!(ubyte[]) app;
	for (size_t i = 0; i < encoded.length; i++) {
		if (encoded[i] != '%') {
			app ~= cast(ubyte) encoded[i];
			continue;
		}
		// Compare by remaining length: "encoded.length - 2" wraps around for very short inputs.
		if (encoded.length - i < 3) {
			throw new URLException("Invalid percent encoded value: expected two characters after " ~
					"percent symbol. Error at index " ~ i.to!string);
		}
		immutable hi = hexValue(encoded[i + 1]);
		immutable lo = hexValue(encoded[i + 2]);
		if (hi < 0 || lo < 0) {
			throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
					"digits after percent symbol. Error at index " ~ i.to!string);
		}
		app ~= cast(ubyte)((hi << 4) | lo);
		i += 2;
	}
	return app.data;
}
803 
/*
 * Convert a host name to its ASCII form.
 *
 * A host name made only of permitted ASCII characters is returned unchanged. If it contains any
 * non-ASCII character, each dot-separated label is passed through punyEncode.
 *
 * Throws: URLException if the host name contains an ASCII character outside the permitted
 * ranges 0x2C-0x39, 'A'-'Z' and 'a'-'z'.
 */
private string toPuny(string unicodeHostname) {
	bool mustEncode = false;
	foreach (i, dchar d; unicodeHostname) {
		auto c = cast(uint) d;
		if (c >= 0x80) {
			// Non-ASCII: the name needs encoding. Keep scanning so that an illegal ASCII
			// character later in the name is still reported.
			mustEncode = true;
			continue;
		}
		// Rejected ranges: controls and punctuation below ',', ':' through '@', '[' through '`',
		// and '{' upwards.
		if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
			throw new URLException(
					format(
						"domain name '%s' contains illegal character '%s' at position %s",
						unicodeHostname, d, i));
		}
	}
	if (!mustEncode) {
		return unicodeHostname;
	}
	return unicodeHostname.split('.').map!punyEncode.join(".");
}
824 
// Decode a host name label by label: each dot-separated label goes through punyDecode and the
// results are joined with dots again.
private string fromPuny(string hostname) {
	string[] labels = hostname.split('.');
	foreach (ref label; labels) {
		label = punyDecode(label);
	}
	return labels.join(".");
}
828 
private {
	// Constants shared by the Punycode encoder and decoder below.
	enum delimiter = '-';
	enum marker = "xn--";
	enum ulong damp = 700;
	enum ulong tmin = 1;
	enum ulong tmax = 26;
	enum ulong skew = 38;
	enum ulong base = 36;
	enum ulong initialBias = 72;
	enum dchar initialN = cast(dchar)128;

	// Compute the next bias from the delta just processed, the number of code points handled so
	// far, and whether this is the first adaptation.
	ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
		enum ulong threshold = ((base - tmin) * tmax) / 2;

		delta = firstTime ? delta / damp : delta / 2;
		delta += delta / numPoints;

		ulong k = 0;
		for (; delta > threshold; k += base) {
			delta /= (base - tmin);
		}
		return k + (((base - tmin + 1) * delta) / (delta + skew));
	}
}
855 
/**
	* Encode the input string using the Punycode algorithm.
	*
	* Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
	* with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
	* in Punycode, you will get "xn--m3h.xn--n3h.com".
	*
	* A segment made only of ASCII characters is returned unchanged, without the marker.
	*
	* In order to puny-encode a domain name, you must split it into its components. The following will
	* typically suffice:
	* ---
	* auto domain = "☂.☃.com";
	* auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
	* ---
	*
	* Params:
	*   input = a single domain name segment
	*
	* Returns: the encoded segment, or the input itself if it needs no encoding.
	*
	* Throws:
	*   URLException if the delta value overflows while encoding.
	*/
string punyEncode(string input) {
	ulong delta = 0;
	dchar n = initialN;
	auto bias = initialBias;
	Appender!string output;
	output ~= marker;
	size_t pushed = 0;
	size_t codePoints = 0;
	// Copy the basic (ASCII) code points first. initialN itself (U+0080) is not basic.
	foreach (dchar c; input) {
		codePoints++;
		if (c < initialN) {
			output ~= c;
			pushed++;
		}
	}
	if (pushed == codePoints) {
		// No encoding to do.
		return input;
	}
	if (pushed > 0) {
		output ~= delimiter;
	}
	bool first = true;
	while (pushed < codePoints) {
		// Find the smallest code point not yet handled. The sentinel lies outside the dchar
		// range so that every real code point can be selected.
		uint best = uint.max;
		foreach (dchar c; input) {
			if (n <= c && c < best) {
				best = c;
			}
		}
		if (best == uint.max) {
			throw new URLException("failed to find a new codepoint to process during punyencode");
		}
		// Widen before multiplying so the product cannot wrap in 32 bits.
		delta += cast(ulong)(best - n) * (pushed + 1);
		if (delta > uint.max) {
			throw new URLException("overflow during punyencode");
		}
		n = cast(dchar) best;
		foreach (dchar c; input) {
			if (c < n) {
				delta++;
			}
			if (c == n) {
				// Emit delta as a variable-length integer.
				ulong q = delta;
				auto k = base;
				while (true) {
					ulong t;
					if (k <= bias) {
						t = tmin;
					} else if (k >= bias + tmax) {
						t = tmax;
					} else {
						t = k - bias;
					}
					if (q < t) {
						break;
					}
					output ~= digitToBasic(t + ((q - t) % (base - t)));
					q = (q - t) / (base - t);
					k += base;
				}
				output ~= digitToBasic(q);
				pushed++;
				bias = adapt(delta, pushed, first);
				first = false;
				delta = 0;
			}
		}
		delta++;
		n++;
	}
	return output.data;
}
946 
/**
	* Decode the input string using the Punycode algorithm.
	*
	* Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
	* with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
	* in Punycode, you will get "xn--m3h.xn--n3h.com".
	*
	* A segment that does not start with "xn--" is returned unchanged.
	*
	* In order to puny-decode a domain name, you must split it into its components. The following will
	* typically suffice:
	* ---
	* auto domain = "xn--m3h.xn--n3h.com";
	* auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
	* ---
	*
	* Params:
	*   input = a single domain name segment
	*
	* Returns: the decoded segment.
	*
	* Throws:
	*   URLException if the encoded part is truncated, contains a character that is not a
	*   Punycode digit, overflows, or decodes to an invalid code point.
	*/
string punyDecode(string input) {
	if (!input.startsWith(marker)) {
		return input;
	}
	input = input[marker.length..$];

	dchar n = initialN;
	ulong i = 0;
	auto bias = initialBias;
	dchar[] output;
	// This reserves a bit more than necessary, but it should be more efficient overall than
	// growing the array one insertion at a time.
	output.reserve(input.length);

	// Everything before the last delimiter is literal text; copy it and skip the delimiter.
	auto end = input.lastIndexOf(delimiter);
	if (end > -1) {
		foreach (dchar c; input[0..end]) {
			output ~= c;
		}
		input = input[end+1 .. $];
	}

	size_t pos = 0;
	while (pos < input.length) {
		auto oldi = i;
		// 64-bit weight, checked against uint.max below, so overflow is detected instead of
		// silently wrapping.
		ulong w = 1;
		// Read one variable-length integer and add it to i.
		for (ulong k = base; ; k += base) {
			if (pos >= input.length) {
				throw new URLException("punycode input ended in the middle of an encoded value");
			}
			// The remaining input must be ASCII digits, so it can be indexed bytewise.
			auto c = input[pos];
			pos++;
			auto digit = basicToDigit(c);
			if (digit >= base) {
				throw new URLException("punycode input contains a character that is not a valid digit");
			}
			i += digit * w;
			if (i > uint.max) {
				throw new URLException("overflow during punydecode");
			}
			ulong t;
			if (k <= bias) {
				t = tmin;
			} else if (k >= bias + tmax) {
				t = tmax;
			} else {
				t = k - bias;
			}
			if (digit < t) {
				break;
			}
			w *= (base - t);
			if (w > uint.max) {
				throw new URLException("overflow during punydecode");
			}
		}
		bias = adapt(i - oldi, output.length + 1, oldi == 0);
		// Advance n by the number of complete passes over the output made so far.
		immutable next = n + i / (output.length + 1);
		if (next > dchar.max || !isValidDchar(cast(dchar) next)) {
			throw new URLException("punycode input decodes to an invalid code point");
		}
		n = cast(dchar) next;
		i %= (output.length + 1);
		// insertInPlace is not marked @safe, hence the trusted wrapper.
		(() @trusted { output.insertInPlace(i, n); })();
		i++;
	}
	return output.to!string;
}
1043 
// Lifted from punycode.js.
// Map a digit value to its character: 0..25 become 'a'..'z', 26..35 become '0'..'9'.
private dchar digitToBasic(ulong digit) {
	immutable ulong offset = digit < 26 ? 'a' : '0' - 26;
	return cast(dchar)(digit + offset);
}
1048 
// Lifted from punycode.js.
// Map a character to its digit value: letters of either case give 0..25, '0'..'9' give 26..35.
// Any other character gives `base`.
private uint basicToDigit(char c) {
	if (c >= '0' && c <= '9') {
		return c - '0' + 26;
	}
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';
	}
	if (c >= 'a' && c <= 'z') {
		return c - 'a';
	}
	return base;
}
1063 
unittest {
	// Encoding single labels.
	{
		auto a = "b\u00FCcher";
		assert(punyEncode(a) == "xn--bcher-kva");
	}
	{
		auto a = "b\u00FCc\u00FCher";
		assert(punyEncode(a) == "xn--bcher-kvab");
	}
	{
		auto a = "ýbücher";
		auto b = punyEncode(a);
		assert(b == "xn--bcher-kvaf", b);
	}

	{
		auto a = "mañana";
		assert(punyEncode(a) == "xn--maana-pta");
	}

	{
		// A label with no ASCII characters at all.
		auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
			~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
		auto b = punyEncode(a);
		assert(b == "xn--egbpdaj6bu4bxfgehfvwxn", b);
	}
}

unittest {
	// Decoding single labels.
	{
		auto b = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
		assert(b == "ليهمابتكلموشعربي؟", b);
	}
	{
		assert(punyDecode("xn--maana-pta") == "mañana");
	}
}

unittest {
	// Whole domain names are converted label by label.
	import std.string, std.algorithm, std.array, std.range;
	{
		auto domain = "xn--m3h.xn--n3h.com";
		auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
		assert(decodedDomain == "☂.☃.com", decodedDomain);
	}
	{
		auto domain = "☂.☃.com";
		auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
		assert(encodedDomain == "xn--m3h.xn--n3h.com", encodedDomain);
	}
}