/**
 * A URL handling library.
 *
 * URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
 * elements like port, path, username, and password.
 *
 * This module aims to make it simple to muck about with them.
 *
 * Example usage:
 * ---
 * auto url = "ssh://me:password@192.168.0.8/".parseURL;
 * auto files = system("ssh", url.toString, "ls").splitLines;
 * foreach (file; files) {
 *     auto fileURL = url;
 *     fileURL.path = file;
 *     system("scp", fileURL.toString, ".");
 * }
 * ---
 */
module url;

import std.algorithm;
import std.array;
import std.conv;
import std.encoding;
import std.string;
import std.utf;

/// An exception thrown when something bad happens with URLs.
class URLException : Exception {
    this(string msg) { super(msg); }
}

/**
 * A mapping from schemes to their default ports.
 *
 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
 * use even if they use ports. Entries here should be treated as best guesses.
 */
ushort[string] schemeToDefaultPort;

static this() {
    schemeToDefaultPort = [
        "aaa": 3868,
        "aaas": 5658,
        "acap": 674,
        "cap": 1026,
        "coap": 5683,
        "coaps": 5684,
        "dav": 443,
        "dict": 2628,
        "ftp": 21,
        "git": 9418,
        "go": 1096,
        "gopher": 70,
        "http": 80,
        "https": 443,
        "iac": 4569,
        "icap": 1344,
        "imap": 143,
        "ipp": 631,
        "ipps": 631,  // yes, they're both mapped to port 631
        "irc": 6667,  // De facto default port, not the IANA reserved port.
        "ircs": 6697,
        "iris": 702,  // defaults to iris.beep
        "iris.beep": 702,
        "iris.lwz": 715,
        "iris.xpc": 713,
        "iris.xpcs": 714,
        "jabber": 5222,  // client-to-server
        "ldap": 389,
        "ldaps": 636,
        "msrp": 2855,
        "msrps": 2855,
        "mtqp": 1038,
        "mupdate": 3905,
        "news": 119,
        "nfs": 2049,
        "pop": 110,
        "redis": 6379,
        "reload": 6084,
        "rsync": 873,
        "rtmfp": 1935,
        "rtsp": 554,
        "shttp": 80,
        "sieve": 4190,
        "sip": 5060,
        "sips": 5061,
        "smb": 445,
        "smtp": 25,
        "snews": 563,
        "snmp": 161,
        "soap.beep": 605,
        "ssh": 22,
        "stun": 3478,
        "stuns": 5349,
        "svn": 3690,
        "teamspeak": 9987,
        "telnet": 23,
        "tftp": 69,
        "tip": 3372,
    ];
}

/**
 * A Uniform Resource Locator.
 *
 * The syntax for URLs is scheme:[//[user:password@]host[:port]][/]path[?query][#fragment].
 *
 */
struct URL {
    /// The URL scheme. For instance, ssh, ftp, or https.
    string scheme;

    /// The username in this URL. Usually absent. If present, there will also be a password.
    string user;

    /// The password in this URL. Usually absent.
    string pass;

    /// The hostname.
    string host;

    /// The port.
    /// This is inferred from the scheme if it isn't present in the URL itself.
    /// If the scheme is not known and the port is not present, the port will be given as 0.
    /// For some schemes, port will not be sensible -- for instance, file or chrome-extension.
    ///
    /// If you explicitly need to detect whether the user provided a port, check the providedPort
    /// field.
    @property ushort port() {
        if (providedPort != 0) {
            return providedPort;
        }
        if (auto p = scheme in schemeToDefaultPort) {
            return *p;
        }
        return 0;
    }

    /// Set the port.
    /// This is a shortcut for convenience because you probably don't care about the difference
    /// between port and providedPort.
    @property ushort port(ushort value) {
        return providedPort = value;
    }

    /// The port that was explicitly provided in the URL.
    /// Zero means that no port was provided.
    ushort providedPort;

    /// The path. This excludes the query string.
    /// For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
    /// "/news/story/17774".
    string path;

    /// The query string elements.
    /// For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
    /// elements will be ["visited": "false"].
    /// Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
    /// ["item": ""].
    string[string] query;

    /// The fragment. In web documents, this typically refers to an anchor element.
    /// For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
    string fragment;

    /// Convert this URL to a string.
    /// The string is properly formatted and usable for, eg, a web request.
    ///
    /// User, password, path segments, query keys and values, and the fragment are
    /// percent-encoded. The host is written as-is. The port is only written when it was
    /// explicitly provided. An absent or empty path is written as "/".
    string toString() {
        Appender!string s;
        s ~= scheme;
        s ~= "://";
        // All emptiness checks use .length: a bare `if (str)` tests the pointer, so an
        // empty-but-non-null string would be treated as present.
        if (user.length) {
            s ~= user.percentEncode;
            s ~= ":";
            s ~= pass.percentEncode;
            s ~= "@";
        }
        s ~= host;
        if (providedPort) {
            s ~= ":";
            s ~= providedPort.to!string;
        }
        string p = path;
        if (p.length && p[0] == '/') {
            p = p[1 .. $];
        }
        if (p.length == 0) {
            // Covers a missing path, an empty path, and the root path "/". Splitting an
            // empty string yields no segments, so the slash has to be written explicitly.
            s ~= '/';
        } else {
            foreach (part; p.split('/')) {
                s ~= '/';
                s ~= part.percentEncode;
            }
        }
        if (query.length) {
            s ~= '?';
            bool first = true;
            foreach (k, v; query) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                // A key with an empty value round-trips as "key", not "key=".
                if (v.length) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        }
        if (fragment.length) {
            s ~= '#';
            s ~= fragment.percentEncode;
        }
        return s.data;
    }
}

/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * Punycode is not supported.
 *
 * Params:
 *   value = the text to parse. If it contains no "://", the scheme is assumed to be "http".
 *   url = receives the parsed URL. It is reset to URL.init before parsing starts, so on failure
 *         it may be partially filled in.
 *
 * Returns:
 *   true if the string was parsed; false if the port was not a valid number, a percent-encoded
 *   component was malformed, or a decoded component was not valid UTF-8.
 */
bool tryParseURL(string value, out URL url) {
    url = URL.init;
    // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]
    // Scheme is optional in common use. We infer 'http' if it's not given.
    auto i = value.indexOf("://");
    if (i > -1) {
        url.scheme = value[0 .. i];
        value = value[i + 3 .. $];
    } else {
        url.scheme = "http";
    }
    // [user:password@]host[:port]][/]path[?query][#fragment
    // The authority part ends at the first '/', '?' or '#', so those all count as delimiters.
    i = value.indexOfAny(":/?#");
    if (i == -1) {
        // Just a hostname.
        url.host = value;
        return true;
    }

    if (value[i] == ':') {
        // This could be between username and password, or it could be between host and port.
        auto j = value.indexOfAny("@/?#");
        if (j > -1 && value[j] == '@') {
            try {
                url.user = value[0 .. i].percentDecode;
                url.pass = value[i + 1 .. j].percentDecode;
            } catch (URLException) {
                return false;
            }
            value = value[j + 1 .. $];
        }
    }

    // It's trying to be a host/port, not a user/pass.
    i = value.indexOfAny(":/?#");
    if (i == -1) {
        url.host = value;
        return true;
    }
    url.host = value[0 .. i];
    value = value[i .. $];
    if (value[0] == ':') {
        auto end = value.indexOfAny("/?#");
        if (end == -1) {
            end = cast(ptrdiff_t) value.length;
        }
        try {
            url.port = value[1 .. end].to!ushort;
        } catch (ConvException) {
            return false;
        }
        value = value[end .. $];
        if (value.length == 0) {
            return true;
        }
    }

    // The path is percent-decoded whether or not a query or fragment follows it.
    i = value.indexOfAny("?#");
    string rawPath = (i == -1) ? value : value[0 .. i];
    try {
        url.path = rawPath.percentDecode;
    } catch (URLException) {
        return false;
    }
    if (i == -1) {
        return true;
    }

    auto c = value[i];
    value = value[i + 1 .. $];
    if (c == '?') {
        i = value.indexOf('#');
        string query;
        if (i < 0) {
            query = value;
            value = null;
        } else {
            query = value[0 .. i];
            value = value[i + 1 .. $];
        }
        foreach (q; query.split('&')) {
            if (q.length == 0) {
                // Stray separators ("a&&b", trailing "&") carry no key.
                continue;
            }
            auto j = q.indexOf('=');
            try {
                if (j == -1) {
                    url.query[q.percentDecode] = "";
                } else {
                    url.query[q[0 .. j].percentDecode] = q[j + 1 .. $].percentDecode;
                }
            } catch (URLException) {
                return false;
            }
        }
    }

    // Whatever is left (possibly nothing) is the fragment.
    try {
        url.fragment = value.percentDecode;
    } catch (URLException) {
        return false;
    }

    return true;
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            query["hello"] = "world";
            query["gibe"] = "clay";
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            query["❄"] = "❀";
            query["["] = "]";
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            query["hi"] = "bye";
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
    {
        // Empty (but non-null) path and root path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "";
        }
        assert(url.toString == "https://example.org/", url.toString);
        url.path = "/";
        assert(url.toString == "https://example.org/", url.toString);
    }
    {
        // Query key with an empty value.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            query["item"] = "";
        }
        assert(url.toString == "https://example.org/?item", url.toString);
    }
}

/**
 * Parse the input string as a URL.
 *
 * Params:
 *   value = the text to parse; see tryParseURL for the accepted forms.
 *
 * Returns:
 *   The parsed URL.
 *
 * Throws:
 *   URLException if the string was in an incorrect format.
 */
URL parseURL(string value) {
    URL url;
    if (tryParseURL(value, url)) {
        return url;
    }
    throw new URLException("failed to parse URL " ~ value);
}

///
unittest {
    {
        // Infer scheme
        auto u1 = parseURL("example.org");
        assert(u1.scheme == "http");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 80);
        assert(u1.providedPort == 0);
        assert(u1.fragment == "");
    }
    {
        // Simple host and scheme
        auto u1 = parseURL("https://example.org");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With path
        auto u1 = parseURL("https://example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With explicit port
        auto u1 = parseURL("https://example.org:1021/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 1021);
        assert(u1.providedPort == 1021);
    }
    {
        // With user
        auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
    }
    {
        // With user, URL-encoded
        auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob!");
        assert(u1.pass == "secret!?");
    }
    {
        // With user and port and path
        auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 2210);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
        assert(u1.fragment == "");
    }
    {
        // With query string
        auto u1 = parseURL("https://example.org/?login=true");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.query["login"] == "true");
        assert(u1.fragment == "");
    }
    {
        // With query string and fragment
        auto u1 = parseURL("https://example.org/?login=true#justkidding");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.query["login"] == "true");
        assert(u1.fragment == "justkidding");
    }
    {
        // With URL-encoded values
        auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
        assert(u1.query["❄"] == "=");
        assert(u1.fragment == "^");
    }
    {
        // URL-encoded path with no query or fragment after it
        auto u1 = parseURL("https://example.org/%E2%98%83");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
    }
    {
        // Query and fragment directly after the host
        auto u1 = parseURL("http://example.org?x=1#f");
        assert(u1.host == "example.org", u1.host);
        assert(u1.path == "");
        assert(u1.query["x"] == "1");
        assert(u1.fragment == "f");
    }
    {
        // Query directly after the port
        auto u1 = parseURL("example.org:8080?a=b");
        assert(u1.host == "example.org", u1.host);
        assert(u1.providedPort == 8080);
        assert(u1.query["a"] == "b");
    }
    {
        // Malformed escapes are rejected rather than crashing
        URL u1;
        assert(!tryParseURL("https://example.org/%", u1));
        assert(!tryParseURL("https://example.org/%zz", u1));
    }
}

/**
 * Percent-encode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * Params:
 *   raw = the text to encode.
 *
 * Returns:
 *   The text with every character other than ASCII letters, digits, and -._~ replaced by the
 *   %XX form (uppercase hex) of each octet of its UTF-8 encoding.
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    Appender!string app;
    foreach (dchar d; raw) {
        if (('a' <= d && 'z' >= d) ||
                ('A' <= d && 'Z' >= d) ||
                ('0' <= d && '9' >= d) ||
                d == '-' || d == '.' || d == '_' || d == '~') {
            app ~= d;
            continue;
        }
        // Something simple like a space character? Still in 7-bit ASCII?
        // Then we get a single-character string out of it and just encode
        // that one bit.
        // Something not in 7-bit ASCII? Then we percent-encode each octet
        // in the UTF-8 encoding (and hope the server understands UTF-8).
        char[] c;
        encode(c, d);
        auto bytes = cast(ubyte[])c;
        foreach (b; bytes) {
            app ~= format("%%%02X", b);
        }
    }
    return app.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This explicitly ensures that the result is a valid UTF-8 string.
 *
 * Params:
 *   encoded = the percent-encoded text.
 *
 * Returns:
 *   The decoded text.
 *
 * Throws:
 *   URLException if the input contains a malformed escape or the decoded octets are not valid
 *   UTF-8.
 */
string percentDecode(string encoded) {
    ubyte[] raw = percentDecodeRaw(encoded);
    auto s = cast(string) raw;
    if (!s.isValid) {
        // The escapes were well-formed, but the octets they describe are not UTF-8.
        throw new URLException("input contains invalid UTF data");
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    // Hex digits are case-insensitive.
    e = percentDecode("%e2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);
}

/// Value of one hexadecimal digit (either case), or -1 if c is not a hex digit.
private int hexDigitValue(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException.
 *
 * Params:
 *   encoded = the percent-encoded text.
 *
 * Returns:
 *   The decoded octets. Characters that are not part of an escape are copied through unchanged.
 *
 * Throws:
 *   URLException if a percent symbol is not followed by two hexadecimal digits.
 */
ubyte[] percentDecodeRaw(string encoded) {
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    Appender!(ubyte[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= cast(ubyte) encoded[i];
            continue;
        }
        // i < encoded.length here, so this subtraction cannot wrap around.
        if (encoded.length - i < 3) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        immutable hi = hexDigitValue(encoded[i + 1]);
        immutable lo = hexDigitValue(encoded[i + 2]);
        if (hi < 0 || lo < 0) {
            throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
                    "digits after percent symbol. Error at index " ~ i.to!string);
        }
        app ~= cast(ubyte)((hi << 4) | lo);
        i += 2;
    }
    return app.data;
}

unittest {
    import std.exception : assertThrown;

    assert(percentDecodeRaw("a%41b") == cast(ubyte[])[0x61, 0x41, 0x62]);
    assert(percentDecodeRaw("%ff") == cast(ubyte[])[0xFF]);
    assert(percentDecodeRaw("").length == 0);
    assertThrown!URLException(percentDecodeRaw("%"));
    assertThrown!URLException(percentDecodeRaw("%4"));
    assertThrown!URLException(percentDecodeRaw("ab%4"));
    assertThrown!URLException(percentDecodeRaw("%zz"));
    assertThrown!URLException(percentDecodeRaw("%4g"));
}

// NOTE(review): the block below is disabled code that appears to be work-in-progress Punycode
// support; it is kept as it was found and is not compiled.
/++
string toAscii(string unicodeHostname) {
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c > 0x80) {
            mustEncode = true;
            break;
        }
        if (c < 0x2C || (c >= 0x3A && c <= 40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    auto parts = unicodeHostname.split('.');
    char[] result;
    foreach (part; parts) {
        result ~= punyEncode(part);
    }
    return cast(string)result;
}

string punyEncode(string item, string delimiter = null, string marker = null) {
    // Puny state machine initial variables.
    auto base = 36;
    auto tmin = 1;
    auto tmax = 26;
    auto skew = 38;
    auto damp = 700;
    auto initialBias = 72;
    long b = 0;

    bool needToEncode = false;
    Appender!(char[]) app;
    app ~= marker;
    foreach (dchar d; item) {
        if (d > '~') {  // Max printable ASCII. The DEL char isn't allowed in hostnames.
            needToEncode = true;
        } else {
            app ~= d;
            b++;
        }
    }
    if (!needToEncode) {
        return item;
    }
    app ~= delimiter;

    // The puny algorithm.
    // We use 64-bit arithmetic to avoid overflow issues -- unicode only defines up to 0x10FFFF,
    // and we won't be encoding gigabytes of data, but just to be safe.
    // Also we use signed values just to make things easier.
    long delta = 0;
    long bias = initialBias;
    long h = b;
    long lastIndex = 0;

    dchar digitToBasic(ulong digit) {
        if (digit < 26) {
            return 'a' + cast(dchar)digit;
        }
        return cast(dchar)('0' + (digit - 26));
    }

    ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
        auto k = 0;
        delta = firstTime ? (delta / damp) : delta >> 1;
        delta += (delta / numPoints);
        for (; delta > (base - tmin) * tmax >> 1; k += base) {
            delta = (delta / (base - tmin));
        }
        return k + (base - tmin + 1) * delta / (delta + skew);
    }

    auto f = filter!(x => x >= cast(dchar)128)(item).array;
    auto uniqueChars = uniq(std.algorithm.sorting.sort(f));
    foreach (dchar n; uniqueChars) {
        foreach (dchar c; item) {
            if (c < n) {
                delta++;
            } else if (c == n) {
                auto q = delta;
                for (ulong k = 0; k < cast(ulong)uint.max; k += base) {
                    auto t = k <= bias ? tmin : (k >= bias + tmax ? tmax : k - bias);
                    if (q < t) {
                        break;
                    }
                    app ~= digitToBasic(t + ((q - t) % (base - t)));
                    q = (q - t) / (base - t);
                }
                app ~= digitToBasic(q);
                bias = adapt(delta, h + 1, h == b);
                h++;
            }
        }
        delta++;
    }
    return cast(string)app.data;
}

unittest {
    import std.stdio;
    auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
        ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
    writeln(a);
    writeln(punyEncode(a));
    assert(punyEncode(a) == "egbpdaj6bu4bxfgehfvwxn");
}

struct URL {
    Host host;
}
++/