1 /**
2 	* A URL handling library.
3 	*
	* URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
	* elements like port, path, query string, fragment, username, and password.
6 	*
7 	* This module aims to make it simple to muck about with them.
8 	*
9 	* Example usage:
10 	* ---
11 	* auto url = "ssh://me:password@192.168.0.8/".parseURL;
12 	* auto files = system("ssh", url.toString, "ls").splitLines;
13 	* foreach (file; files) {
14 	*		auto fileURL = url;
15 	*		fileURL.path = file;
16 	*		system("scp", fileURL.toString, ".");
17 	* }
18 	* ---
19 	*/
20 module url;
21 
22 import std.algorithm;
23 import std.array;
24 import std.conv;
25 import std.encoding;
26 import std.string;
27 import std.utf;
28 
/// An exception thrown when something bad happens with URLs.
///
/// The constructor forwards the throw site (file and line) and an optional chained throwable to
/// Exception, so error reports point at the code that detected the problem rather than at this
/// constructor.
class URLException : Exception {
	/// Params:
	///   msg = human-readable description of the problem
	///   file = source file of the throw site (filled in automatically)
	///   line = source line of the throw site (filled in automatically)
	///   next = an earlier throwable this one is chained to, if any
	this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
			@safe pure nothrow {
		super(msg, file, line, next);
	}
}
33 
/**
	* A mapping from schemes to their default ports.
	*
	* This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
	* use even if they use ports. Entries here should be treated as best guesses.
	*
	* The table is filled in by the module constructor that follows. URL.port consults it when a
	* URL has no explicitly provided port.
	*/
ushort[string] schemeToDefaultPort;

// Module constructor: populates schemeToDefaultPort with the known scheme -> port pairs.
// Keys are scheme names exactly as they are looked up (lower case, no trailing colon).
static this() {
	schemeToDefaultPort = [
		"aaa": 3868,
		"aaas": 5658,
		"acap": 674,
		"cap": 1026,
		"coap": 5683,
		"coaps": 5684,
		"dav": 443,
		"dict": 2628,
		"ftp": 21,
		"git": 9418,
		"go": 1096,
		"gopher": 70,
		"http": 80,
		"https": 443,
		"iac": 4569,
		"icap": 1344,
		"imap": 143,
		"ipp": 631,
		"ipps": 631,  // yes, they're both mapped to port 631
		"irc": 6667,  // De facto default port, not the IANA reserved port.
		"ircs": 6697,
		"iris": 702,  // defaults to iris.beep
		"iris.beep": 702,
		"iris.lwz": 715,
		"iris.xpc": 713,
		"iris.xpcs": 714,
		"jabber": 5222,  // client-to-server
		"ldap": 389,
		"ldaps": 636,
		"msrp": 2855,
		"msrps": 2855,
		"mtqp": 1038,
		"mupdate": 3905,
		"news": 119,
		"nfs": 2049,
		"pop": 110,
		"redis": 6379,
		"reload": 6084,
		"rsync": 873,
		"rtmfp": 1935,
		"rtsp": 554,
		"shttp": 80,
		"sieve": 4190,
		"sip": 5060,
		"sips": 5061,
		"smb": 445,
		"smtp": 25,
		"snews": 563,
		"snmp": 161,
		"soap.beep": 605,
		"ssh": 22,
		"stun": 3478,
		"stuns": 5349,
		"svn": 3690,
		"teamspeak": 9987,
		"telnet": 23,
		"tftp": 69,
		"tip": 3372,
	];
}
104 
/**
	* A Uniform Resource Locator.
	*
	* The syntax for URLs is scheme:[//[user:password@]host[:port]][/]path[?query][#fragment].
	*
	* All string fields hold decoded (not percent-encoded) text; toString applies percent-encoding
	* where it is needed. A field that is empty is treated as absent, regardless of whether the
	* underlying string is null or merely zero-length.
	*/
struct URL {
	/// The URL scheme. For instance, ssh, ftp, or https.
	string scheme;

	/// The username in this URL. Usually absent. If present, there will also be a password.
	string user;

	/// The password in this URL. Usually absent.
	string pass;

	/// The hostname.
	string host;

	/// The port.
	/// This is inferred from the scheme if it isn't present in the URL itself.
	/// If the scheme is not known and the port is not present, the port will be given as 0.
	/// For some schemes, port will not be sensible -- for instance, file or chrome-extension.
	///
	/// If you explicitly need to detect whether the user provided a port, check the providedPort
	/// field.
	@property ushort port() const {
		if (providedPort != 0) {
			return providedPort;
		}
		// Fall back to the best-guess default for this scheme, if we know one.
		if (auto known = scheme in schemeToDefaultPort) {
			return *known;
		}
		return 0;
	}

	/// Set the port.
	/// This is a shortcut for convenience because you probably don't care about the difference
	/// between port and providedPort.
	@property ushort port(ushort value) {
		return providedPort = value;
	}

	/// The port that was explicitly provided in the URL.
	/// Zero means that no port was provided.
	ushort providedPort;

	/// The path. This excludes the query string.
	/// For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
	/// "/news/story/17774".
	string path;

	/// The query string elements.
	/// For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
	/// elements will be ["visited": "false"].
	/// Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
	/// ["item": ""].
	string[string] query;

	/// The fragment. In web documents, this typically refers to an anchor element.
	/// For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
	string fragment;

	/// Convert this URL to a string.
	/// The string is properly formatted and usable for, eg, a web request.
	///
	/// User, password, path segments, query keys/values and the fragment are percent-encoded.
	/// The host is written as-is. The port is written only when it was explicitly provided.
	/// Query elements are written in the associative array's iteration order. An element with
	/// an empty value is written as a bare key (no '=').
	string toString() const {
		Appender!string s;
		s ~= scheme;
		s ~= "://";
		// Test lengths rather than null-ness: an empty but non-null string (for example a
		// zero-length slice of the parsed input) must count as "absent" too.
		if (user.length) {
			s ~= user.percentEncode;
			s ~= ":";
			s ~= pass.percentEncode;
			s ~= "@";
		}
		s ~= host;
		if (providedPort) {
			s ~= ":";
			s ~= providedPort.to!string;
		}

		// The leading slash is emitted by us, so drop it from the stored path first.
		string p = path;
		if (p.length && p[0] == '/') {
			p = p[1 .. $];
		}
		if (p.length == 0) {
			// Covers both "no path" and the root path "/": splitting an empty string yields
			// no segments, which would silently lose the slash.
			s ~= '/';
		} else {
			// Encode segment by segment so the separators themselves stay literal.
			foreach (part; p.split('/')) {
				s ~= '/';
				s ~= part.percentEncode;
			}
		}

		if (query.length) {
			s ~= '?';
			bool first = true;
			foreach (k, v; query) {
				if (!first) {
					s ~= '&';
				}
				first = false;
				s ~= k.percentEncode;
				if (v.length) {
					s ~= '=';
					s ~= v.percentEncode;
				}
			}
		}
		if (fragment.length) {
			s ~= '#';
			s ~= fragment.percentEncode;
		}
		return s.data;
	}
}
219 
/**
	* Parse a URL from a string.
	*
	* This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
	* may be made. However, any URL in a correct format will be parsed correctly.
	*
	* The accepted shape is
	* [scheme://][user[:password]@]host[:port][/path][?query][#fragment].
	* If the scheme is missing, "http" is assumed. The host may be a bracketed IPv6 literal such as
	* [::1]. An empty port ("host:") is treated as no port. User, password, path, query keys and
	* values, and fragment are percent-decoded; empty query elements ("a=1&&b=2") are skipped.
	*
	* Punycode is not supported.
	*
	* Params:
	*   value = the text to parse
	*   url = receives the parsed URL on success; left as URL.init on failure
	*
	* Returns: true if the text could be parsed, false if it contains an invalid port, malformed
	*   percent-encoding, or percent-encoded data that is not valid UTF-8.
	*/
bool tryParseURL(string value, out URL url) {
	// Build into a local so a failure half-way through never leaks partial results to the caller.
	URL parsed;

	// Scheme. Only accept the text before "://" as a scheme if it cannot be anything else:
	// a '/', '?' or '#' in front of it means the "://" belongs to a path, query or fragment
	// (for example "example.org/go?to=http://other.org").
	auto schemeEnd = value.indexOf("://");
	if (schemeEnd > -1 && value[0 .. schemeEnd].indexOfAny("/?#") == -1) {
		parsed.scheme = value[0 .. schemeEnd];
		value = value[schemeEnd + 3 .. $];
	} else {
		parsed.scheme = "http";
	}

	// Authority ([user[:password]@]host[:port]) runs up to the first '/', '?' or '#'.
	string authority = value;
	string rest = null;
	auto authorityEnd = value.indexOfAny("/?#");
	if (authorityEnd > -1) {
		authority = value[0 .. authorityEnd];
		rest = value[authorityEnd .. $];
	}

	try {
		// User info, if any, is everything before the '@'.
		auto at = authority.indexOf('@');
		if (at > -1) {
			auto userInfo = authority[0 .. at];
			authority = authority[at + 1 .. $];
			auto passAt = userInfo.indexOf(':');
			if (passAt == -1) {
				parsed.user = userInfo.percentDecode;
			} else {
				parsed.user = userInfo[0 .. passAt].percentDecode;
				parsed.pass = userInfo[passAt + 1 .. $].percentDecode;
			}
		}

		// Host and port. A bracketed IPv6 literal contains colons of its own, so the port
		// separator can only be the ':' directly after the closing bracket.
		ptrdiff_t portAt;
		if (authority.length && authority[0] == '[') {
			auto close = authority.indexOf(']');
			if (close == -1) {
				return false;
			}
			if (close + 1 == authority.length) {
				portAt = -1;
			} else if (authority[close + 1] == ':') {
				portAt = close + 1;
			} else {
				return false;
			}
		} else {
			portAt = authority.indexOf(':');
		}
		if (portAt == -1) {
			parsed.host = authority;
		} else {
			parsed.host = authority[0 .. portAt];
			auto portText = authority[portAt + 1 .. $];
			// RFC 3986 allows an empty port after the colon; treat it as "not provided".
			if (portText.length) {
				parsed.providedPort = portText.to!ushort;
			}
		}

		// Fragment: everything after the first '#', even if it contains a '?'.
		auto fragmentAt = rest.indexOf('#');
		if (fragmentAt > -1) {
			parsed.fragment = rest[fragmentAt + 1 .. $].percentDecode;
			rest = rest[0 .. fragmentAt];
		}

		// Query: '&'-separated elements, each either "key" or "key=value".
		auto queryAt = rest.indexOf('?');
		if (queryAt > -1) {
			foreach (element; rest[queryAt + 1 .. $].split('&')) {
				if (element.length == 0) {
					continue;
				}
				auto eq = element.indexOf('=');
				if (eq == -1) {
					parsed.query[element.percentDecode] = "";
				} else {
					parsed.query[element[0 .. eq].percentDecode] = element[eq + 1 .. $].percentDecode;
				}
			}
			rest = rest[0 .. queryAt];
		}

		// Whatever is left is the path. Decode it in every case, so the result does not depend
		// on whether a query or fragment happened to follow.
		parsed.path = rest.percentDecode;
	} catch (URLException) {
		// Malformed percent-encoding or invalid UTF-8 in a decoded component.
		return false;
	} catch (ConvException) {
		// Port is not a number, or does not fit in 16 bits.
		return false;
	}

	url = parsed;
	return true;
}
331 
///
unittest {
	// Plain ASCII components are written out unchanged.
	{
		URL basic;
		basic.scheme = "https";
		basic.host = "example.org";
		basic.path = "/foo/bar";
		basic.query["hello"] = "world";
		basic.query["gibe"] = "clay";
		basic.fragment = "frag";

		// The query is an associative array, so either ordering is acceptable.
		immutable text = basic.toString;
		immutable helloFirst = text == "https://example.org/foo/bar?hello=world&gibe=clay#frag";
		immutable gibeFirst = text == "https://example.org/foo/bar?gibe=clay&hello=world#frag";
		assert(helloFirst || gibeFirst, text);
	}

	// Non-ASCII and reserved characters are percent-encoded.
	{
		URL encoded;
		encoded.scheme = "https";
		encoded.host = "example.org";
		encoded.path = "/f☃o";
		encoded.query["❄"] = "❀";
		encoded.query["["] = "]";
		encoded.fragment = "ş";

		// Again, either query ordering is acceptable.
		immutable text = encoded.toString;
		immutable snowflakeFirst =
			text == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F";
		immutable bracketFirst =
			text == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F";
		assert(snowflakeFirst || bracketFirst, text);
	}

	// User, password and an explicit port all appear in the authority part.
	{
		URL withAuth;
		withAuth.scheme = "https";
		withAuth.host = "example.org";
		withAuth.user = "dhasenan";
		withAuth.pass = "itsasecret";
		withAuth.port = 17;

		immutable text = withAuth.toString;
		assert(text == "https://dhasenan:itsasecret@example.org:17/", text);
	}

	// A query without a path still gets the root slash.
	{
		URL queryOnly;
		queryOnly.scheme = "https";
		queryOnly.host = "example.org";
		queryOnly.query["hi"] = "bye";

		immutable text = queryOnly.toString;
		assert(text == "https://example.org/?hi=bye", text);
	}
}
395 
/**
	* Parse the input string as a URL, throwing instead of returning a status.
	*
	* This is a thin wrapper around tryParseURL.
	*
	* Params:
	*   value = the text to parse
	*
	* Returns: the parsed URL.
	*
	* Throws:
	*   URLException if the string was in an incorrect format.
	*/
URL parseURL(string value) {
	URL result;
	immutable parsedOk = tryParseURL(value, result);
	if (!parsedOk) {
		throw new URLException("failed to parse URL " ~ value);
	}
	return result;
}
409 
///
unittest {
	// No scheme given: http is assumed, and the port follows from it.
	{
		auto bare = parseURL("example.org");
		assert(bare.scheme == "http");
		assert(bare.host == "example.org");
		assert(bare.path == "");
		assert(bare.port == 80);
		assert(bare.providedPort == 0);
		assert(bare.fragment == "");
	}

	// Scheme and host only.
	{
		auto hostOnly = parseURL("https://example.org");
		assert(hostOnly.scheme == "https");
		assert(hostOnly.host == "example.org");
		assert(hostOnly.path == "");
		assert(hostOnly.port == 443);
		assert(hostOnly.providedPort == 0);
	}

	// Scheme, host and path.
	{
		auto withPath = parseURL("https://example.org/foo/bar");
		assert(withPath.scheme == "https");
		assert(withPath.host == "example.org");
		assert(withPath.path == "/foo/bar", "expected /foo/bar but got " ~ withPath.path);
		assert(withPath.port == 443);
		assert(withPath.providedPort == 0);
	}

	// An explicit port overrides the scheme default and is remembered as provided.
	{
		auto withPort = parseURL("https://example.org:1021/foo/bar");
		assert(withPort.scheme == "https");
		assert(withPort.host == "example.org");
		assert(withPort.path == "/foo/bar", "expected /foo/bar but got " ~ withPort.path);
		assert(withPort.port == 1021);
		assert(withPort.providedPort == 1021);
	}

	// User name and password.
	{
		auto withUser = parseURL("https://bob:secret@example.org/foo/bar");
		assert(withUser.scheme == "https");
		assert(withUser.host == "example.org");
		assert(withUser.path == "/foo/bar");
		assert(withUser.port == 443);
		assert(withUser.user == "bob");
		assert(withUser.pass == "secret");
	}

	// User name and password are percent-decoded.
	{
		auto encodedUser = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
		assert(encodedUser.scheme == "https");
		assert(encodedUser.host == "example.org");
		assert(encodedUser.path == "/foo/bar");
		assert(encodedUser.port == 443);
		assert(encodedUser.user == "bob!");
		assert(encodedUser.pass == "secret!?");
	}

	// User, password, explicit port and path together.
	{
		auto everything = parseURL("https://bob:secret@example.org:2210/foo/bar");
		assert(everything.scheme == "https");
		assert(everything.host == "example.org");
		assert(everything.path == "/foo/bar");
		assert(everything.port == 2210);
		assert(everything.user == "bob");
		assert(everything.pass == "secret");
		assert(everything.fragment == "");
	}

	// Query string, no fragment.
	{
		auto withQuery = parseURL("https://example.org/?login=true");
		assert(withQuery.scheme == "https");
		assert(withQuery.host == "example.org");
		assert(withQuery.path == "/", "expected path: / actual path: " ~ withQuery.path);
		assert(withQuery.query["login"] == "true");
		assert(withQuery.fragment == "");
	}

	// Query string followed by a fragment.
	{
		auto withFragment = parseURL("https://example.org/?login=true#justkidding");
		assert(withFragment.scheme == "https");
		assert(withFragment.host == "example.org");
		assert(withFragment.path == "/", "expected path: / actual path: " ~ withFragment.path);
		assert(withFragment.query["login"] == "true");
		assert(withFragment.fragment == "justkidding");
	}

	// Path, query and fragment are all percent-decoded.
	{
		auto decoded = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
		assert(decoded.scheme == "https");
		assert(decoded.host == "example.org");
		assert(decoded.path == "/☃", "expected path: /☃ actual path: " ~ decoded.path);
		assert(decoded.query["❄"] == "=");
		assert(decoded.fragment == "^");
	}
}
508 
/**
	* Percent-encode a string.
	*
	* Only the unreserved characters (ASCII letters, digits, and -._~) are copied through as they
	* are. Every other character is converted to its UTF-8 octets, and each octet is written as
	* a percent sign followed by two upper-case hexadecimal digits.
	*
	* Domain names using Unicode values use Punycode instead; this function is meant for every
	* other URL component.
	*
	* Params:
	*   raw = the text to encode
	*
	* Returns: the percent-encoded text.
	*/
string percentEncode(string raw) {
	static immutable hexDigits = "0123456789ABCDEF";

	Appender!string encoded;
	foreach (dchar ch; raw) {
		switch (ch) {
			// Unreserved characters never need encoding.
			case 'a': .. case 'z':
			case 'A': .. case 'Z':
			case '0': .. case '9':
			case '-', '.', '_', '~':
				encoded ~= ch;
				break;

			// Everything else, ASCII or not, is written octet by octet from its UTF-8 form
			// (and we hope the server understands UTF-8).
			default:
				char[] utf8;
				encode(utf8, ch);
				foreach (octet; cast(ubyte[]) utf8) {
					encoded ~= '%';
					encoded ~= hexDigits[octet >> 4];
					encoded ~= hexDigits[octet & 0x0F];
				}
				break;
		}
	}
	return encoded.data;
}
543 
///
unittest {
	// Unreserved characters come back unchanged.
	foreach (untouched; ["IDontNeedNoPercentEncoding", "~~--..__", "0123456789"]) {
		assert(percentEncode(untouched) == untouched);
	}

	// A non-ASCII character becomes one escape per UTF-8 octet.
	string e = percentEncode("☃");
	assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}
555 
/**
	* Percent-decode a string.
	*
	* The input is decoded to raw octets with percentDecodeRaw, and the octets are then checked
	* to be valid UTF-8 before they are returned as a string.
	*
	* Params:
	*   encoded = the percent-encoded text
	*
	* Returns: the decoded text, guaranteed to be valid UTF-8.
	*
	* Throws:
	*   URLException if the decoded octets are not valid UTF-8, or if percentDecodeRaw rejects the
	*   input.
	*/
string percentDecode(string encoded) {
	auto decoded = cast(string) percentDecodeRaw(encoded);
	if (decoded.isValid) {
		return decoded;
	}
	throw new URLException("input contains invalid UTF data");
}
574 
///
unittest {
	// Text without escapes comes back unchanged.
	foreach (untouched; ["IDontNeedNoPercentDecoding", "~~--..__", "0123456789"]) {
		assert(percentDecode(untouched) == untouched);
	}

	// Three escaped octets combine into a single UTF-8 character.
	string e = percentDecode("%E2%98%83");
	assert(e == "☃", "expected a snowman but got" ~ e);
}
586 
/**
	* Percent-decode a string into a ubyte array.
	*
	* Every "%XY" sequence, where X and Y are hexadecimal digits in either case, is replaced by
	* the single octet it denotes. All other characters are copied through as their own octets.
	*
	* This yields a ubyte array and will not perform validation on the output. However, an improperly
	* formatted input string will result in a URLException.
	*
	* Params:
	*   encoded = the percent-encoded text
	*
	* Returns: the decoded octets; they are not guaranteed to be valid UTF-8.
	*
	* Throws:
	*   URLException if a percent sign is not followed by exactly two hexadecimal digits.
	*/
ubyte[] percentDecodeRaw(string encoded) {
	// Value of one hexadecimal digit, or -1 if the character is not one.
	static int hexValue(char c) {
		if (c >= '0' && c <= '9') {
			return c - '0';
		}
		if (c >= 'A' && c <= 'F') {
			return c - 'A' + 10;
		}
		if (c >= 'a' && c <= 'f') {
			return c - 'a' + 10;
		}
		return -1;
	}

	// We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
	Appender!(ubyte[]) app;
	for (size_t i = 0; i < encoded.length; i++) {
		if (encoded[i] != '%') {
			app ~= cast(ubyte) encoded[i];
			continue;
		}
		// Need two more characters after the '%'. Phrased as a subtraction from the length
		// (i < length always holds here) so it cannot wrap around for very short inputs.
		if (encoded.length - i < 3) {
			throw new URLException("Invalid percent encoded value: expected two characters after " ~
					"percent symbol. Error at index " ~ i.to!string);
		}
		immutable high = hexValue(encoded[i + 1]);
		immutable low = hexValue(encoded[i + 2]);
		if (high < 0 || low < 0) {
			throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
					"digits after percent symbol. Error at index " ~ i.to!string);
		}
		app ~= cast(ubyte)((high << 4) | low);
		// Skip the two digits just consumed; the loop increment skips the '%' itself.
		i += 2;
	}
	return app.data;
}
616 
617 /++
618 string toAscii(string unicodeHostname) {
619 	bool mustEncode = false;
620 	foreach (i, dchar d; unicodeHostname) {
621 		auto c = cast(uint) d;
622 		if (c > 0x80) {
623 			mustEncode = true;
624 			break;
625 		}
626 		if (c < 0x2C || (c >= 0x3A && c <= 40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
627 			throw new URLException(
628 					format(
629 						"domain name '%s' contains illegal character '%s' at position %s",
630 						unicodeHostname, d, i));
631 		}
632 	}
633 	if (!mustEncode) {
634 		return unicodeHostname;
635 	}
636 	auto parts = unicodeHostname.split('.');
637 	char[] result;
638 	foreach (part; parts) {
639 		result ~= punyEncode(part);
640 	}
641 	return cast(string)result;
642 }
643 
644 string punyEncode(string item, string delimiter = null, string marker = null) {
645 	// Puny state machine initial variables.
646 	auto base = 36;
647 	auto tmin = 1;
648 	auto tmax = 26;
649 	auto skew = 38;
650 	auto damp = 700;
651 	auto initialBias = 72;
652 	long b = 0;
653 
654 	bool needToEncode = false;
655 	Appender!(char[]) app;
656 	app ~= marker;
657 	foreach (dchar d; item) {
658 		if (d > '~') {  // Max printable ASCII. The DEL char isn't allowed in hostnames.
659 			needToEncode = true;
660 		} else {
661 			app ~= d;
662 			b++;
663 		}
664 	}
665 	if (!needToEncode) {
666 		return item;
667 	}
668 	app ~= delimiter;
669 
670 	// The puny algorithm.
671 	// We use 64-bit arithmetic to avoid overflow issues -- unicode only defines up to 0x10FFFF,
672 	// and we won't be encoding gigabytes of data, but just to be safe.
673 	// Also we use signed values just to make things easier.
674 	long delta = 0;
675 	long bias = initialBias;
676 	long h = b;
677 	long lastIndex = 0;
678 
679 	dchar digitToBasic(ulong digit) {
680 		if (digit < 26) {
681 			return 'a' + cast(dchar)digit;
682 		}
683 		return cast(dchar)('0' + (digit - 26));
684 	}
685 
686 	ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
687 		auto k = 0;
688 		delta = firstTime ? (delta / damp) : delta >> 1;
689 		delta += (delta / numPoints);
690 		for (; delta > (base - tmin) * tmax >> 1; k += base) {
691 			delta = (delta / (base - tmin));
692 		}
693 		return k + (base - tmin + 1) * delta / (delta + skew);
694 	}
695 
696 	auto f = filter!(x => x >= cast(dchar)128)(item).array;
697 	auto uniqueChars = uniq(std.algorithm.sorting.sort(f));
698 	foreach (dchar n; uniqueChars) {
699 		foreach (dchar c; item) {
700 			if (c < n) {
701 				delta++;
702 			} else if (c == n) {
703 				auto q = delta;
704 				for (ulong k = 0; k < cast(ulong)uint.max; k += base) {
705 					auto t = k <= bias ? tmin : (k >= bias + tmax ? tmax : k - bias);
706 					if (q < t) {
707 						break;
708 					}
709 					app ~= digitToBasic(t + ((q - t) % (base - t)));
710 					q = (q - t) / (base - t);
711 				}
712 				app ~= digitToBasic(q);
713 				bias = adapt(delta, h + 1, h == b);
714 				h++;
715 			}
716 		}
717 		delta++;
718 	}
719 	return cast(string)app.data;
720 }
721 
722 unittest {
723 	import std.stdio;
724 	auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
725 		~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
726 	writeln(a);
727 	writeln(punyEncode(a));
728 	assert(punyEncode(a) == "egbpdaj6bu4bxfgehfvwxn");
729 }
730 
731 struct URL {
732 	Host host;
733 }
734 ++/