/**
 * A URL handling library.
 *
 * URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
 * elements like port, path, username, and password.
 *
 * This module aims to make it simple to muck about with them.
 *
 * Example usage:
 * ---
 * auto url = "ssh://me:password@192.168.0.8/".parseURL;
 * auto files = system("ssh", url.toString, "ls").splitLines;
 * foreach (file; files) {
 *     system("scp", url ~ file, ".");
 * }
 * ---
 *
 * License: The MIT license.
 */
module url;

import std.algorithm;
import std.array;
import std.conv;
import std.encoding;
import std.string;
import std.utf;

@safe:

/// An exception thrown when something bad happens with URLs.
class URLException : Exception {
    this(string msg) { super(msg); }
}

/**
 * A mapping from schemes to their default ports.
 *
 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
 * use even if they use ports. Entries here should be treated as best guesses.
 */
ushort[string] schemeToDefaultPort;

// Populates schemeToDefaultPort when the module is initialised.
static this() {
    schemeToDefaultPort = [
        "aaa": 3868,
        "aaas": 5658,
        "acap": 674,
        "amqp": 5672,
        "cap": 1026,
        "coap": 5683,
        "coaps": 5684,
        "dav": 443,
        "dict": 2628,
        "ftp": 21,
        "git": 9418,
        "go": 1096,
        "gopher": 70,
        "http": 80,
        "https": 443,
        "ws": 80,
        "wss": 443,
        "iac": 4569,
        "icap": 1344,
        "imap": 143,
        "ipp": 631,
        "ipps": 631,  // yes, they're both mapped to port 631
        "irc": 6667,  // De facto default port, not the IANA reserved port.
        "ircs": 6697,
        "iris": 702,  // defaults to iris.beep
        "iris.beep": 702,
        "iris.lwz": 715,
        "iris.xpc": 713,
        "iris.xpcs": 714,
        "jabber": 5222,  // client-to-server
        "ldap": 389,
        "ldaps": 636,
        "msrp": 2855,
        "msrps": 2855,
        "mtqp": 1038,
        "mupdate": 3905,
        "news": 119,
        "nfs": 2049,
        "pop": 110,
        "redis": 6379,
        "reload": 6084,
        "rsync": 873,
        "rtmfp": 1935,
        "rtsp": 554,
        "shttp": 80,
        "sieve": 4190,
        "sip": 5060,
        "sips": 5061,
        "smb": 445,
        "smtp": 25,
        "snews": 563,
        "snmp": 161,
        "soap.beep": 605,
        "ssh": 22,
        "stun": 3478,
        "stuns": 5349,
        "svn": 3690,
        "teamspeak": 9987,
        "telnet": 23,
        "tftp": 69,
        "tip": 3372,
    ];
}

/**
 * A collection of query parameters.
 *
 * This is effectively a multimap of string -> strings.
 */
struct QueryParams {
    import std.typecons;
    alias Tuple!(string, "key", string, "value") Param;
    Param[] params;

    /// The number of (key, value) pairs stored, counting repeated keys separately.
    @property size_t length() {
        return params.length;
    }

    /// Get a range over the query parameter values for the given key.
    /// Only parameters whose key matches exactly are included.
    auto opIndex(string key) {
        // `filter`, not `find`: `find` would also yield every parameter after the first match.
        return params.filter!(x => x.key == key).map!(x => x.value);
    }

    /// Add a query parameter with the given key and value.
    /// If one already exists, there will now be two query parameters with the given name.
    void add(string key, string value) {
        params ~= Param(key, value);
    }

    /// Add a query parameter with the given key and value.
    /// If there are any existing parameters with the same key, they are removed and overwritten.
    void overwrite(string key, string value) {
        // Build a fresh array so that every existing occurrence of the key is dropped and
        // copies of this struct that share the old array are left untouched.
        params = params.filter!(p => p.key != key).array;
        params ~= Param(key, value);
    }

    private struct QueryParamRange {
        size_t i;
        const(Param)[] params;
        bool empty() { return i >= params.length; }
        void popFront() { i++; }
        Param front() { return params[i]; }
    }

    /**
     * A range over the query parameters.
     *
     * Usage:
     * ---
     * foreach (key, value; url.queryParams) {}
     * ---
     */
    auto range() {
        return QueryParamRange(0, this.params);
    }
    /// ditto
    alias range this;
}

/**
 * A Uniform Resource Locator.
 *
 * URLs can be parsed (see parseURL) and implicitly convert to strings.
 */
struct URL {
    /// The URL scheme. For instance, ssh, ftp, or https.
    string scheme;

    /// The username in this URL. Usually absent.
    string user;

    /// The password in this URL. Usually absent.
    string pass;

    /// The hostname.
    string host;

    /**
     * The port.
     *
     * This is inferred from the scheme if it isn't present in the URL itself.
     * If the scheme is not known and the port is not present, the port will be given as 0.
     * For some schemes, port will not be sensible -- for instance, file or chrome-extension.
     *
     * If you explicitly need to detect whether the user provided a port, check the providedPort
     * field.
     */
    @property ushort port() {
        if (providedPort != 0) {
            return providedPort;
        }
        if (auto p = scheme in schemeToDefaultPort) {
            return *p;
        }
        return 0;
    }

    /**
     * Set the port.
     *
     * This sets the providedPort field and is provided for convenience.
     */
    @property ushort port(ushort value) {
        return providedPort = value;
    }

    /// The port that was explicitly provided in the URL.
    ushort providedPort;

    /**
     * The path.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
     * "/news/story/17774".
     */
    string path;

    /**
     * Deprecated: this disallows multiple values for the same query string. Please use queryParams
     * instead.
     *
     * The query string elements.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
     * elements will be ["visited": "false"].
     *
     * Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
     * ["item": ""].
     *
     * This field is mutable, so be cautious.
     */
    deprecated("use queryParams") string[string] query;

    /**
     * The query parameters associated with this URL.
     */
    QueryParams queryParams;

    /**
     * The fragment. In web documents, this typically refers to an anchor element.
     * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
     */
    string fragment;

    /**
     * Convert this URL to a string.
     * The string is properly formatted and usable for, eg, a web request.
     *
     * Throws: URLException if the host contains a character that is illegal in a domain name.
     */
    string toString() {
        return toString(false);
    }

    /**
     * Convert this URL to a string.
     * The string is intended to be human-readable rather than machine-readable.
     */
    string toHumanReadableString() {
        return toString(true);
    }

    // Shared implementation of toString and toHumanReadableString. When humanReadable is false,
    // every component is percent-encoded and the host is Punycode-encoded.
    private string toString(bool humanReadable) {
        Appender!string s;
        s ~= scheme;
        s ~= "://";
        if (user.length) {
            s ~= humanReadable ? user : user.percentEncode;
            // Only write the separator when there is a password, so "user@host" round-trips.
            if (pass.length) {
                s ~= ":";
                s ~= humanReadable ? pass : pass.percentEncode;
            }
            s ~= "@";
        }
        s ~= humanReadable ? host : host.toPuny;

        // The port is omitted when it is the known default for the scheme.
        auto defaultPort = scheme in schemeToDefaultPort;
        if (providedPort != 0 && (defaultPort is null || *defaultPort != providedPort)) {
            s ~= ":";
            s ~= providedPort.to!string;
        }

        string p = path;
        if (p.length == 0 || p == "/") {
            s ~= '/';
        } else {
            if (p[0] == '/') {
                p = p[1..$];
            }
            if (humanReadable) {
                s ~= p;
            } else {
                // Encode each segment separately so the separators themselves stay literal.
                foreach (part; p.split('/')) {
                    s ~= '/';
                    s ~= part.percentEncode;
                }
            }
        }

        // queryParams takes precedence; the deprecated `query` map is only a fallback.
        if (queryParams.length) {
            bool first = true;
            s ~= '?';
            foreach (k, v; queryParams) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                if (v.length > 0) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        } else if (query.length) {
            s ~= '?';
            bool first = true;
            foreach (k, v; query) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                if (v.length > 0) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        }
        if (fragment.length) {
            s ~= '#';
            s ~= fragment.percentEncode;
        }
        return s.data;
    }

    /// Implicitly convert URLs to strings.
    alias toString this;

    /**
     * The append operator (~).
     *
     * The append operator for URLs returns a new URL with the given string appended as a path
     * element to the URL's path. It only adds new path elements (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Query elements are copied.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * auto randInt = random ~ "int";
     * writeln(randInt);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opBinary(string op : "~")(string subsequentPath) {
        URL other = this;
        other ~= subsequentPath;
        if (query) {
            other.query = other.query.dup;
        }
        return other;
    }

    /**
     * The append-in-place operator (~=).
     *
     * The append operator for URLs adds a path element to this URL. It only adds new path elements
     * (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * random ~= "int";
     * writeln(random);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opOpAssign(string op : "~")(string subsequentPath) {
        if (path.endsWith("/")) {
            // Exactly one separator between the old path and the new element.
            if (subsequentPath.startsWith("/")) {
                path ~= subsequentPath[1..$];
            } else {
                path ~= subsequentPath;
            }
        } else {
            if (!subsequentPath.startsWith("/")) {
                path ~= '/';
            }
            path ~= subsequentPath;
        }
        return this;
    }
}

/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * Params:
 *   value = the text to parse
 *   url = receives the parsed URL on success; reset to URL.init on failure
 *
 * Returns: true if the input could be parsed, false otherwise. Malformed percent-encoding,
 * malformed Punycode and unparseable port numbers all yield false rather than an exception.
 */
bool tryParseURL(string value, out URL url) {
    try {
        parseURLInto(value, url);
        return true;
    } catch (URLException) {
    } catch (ConvException) {
    }
    url = URL.init;
    return false;
}

// Does the actual work of tryParseURL. Throws URLException or ConvException on malformed input.
private void parseURLInto(string value, ref URL url) {
    // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]
    // Scheme is optional in common use. We infer 'http' if it's not given.
    // A "//" only introduces the authority when it is at the very start (scheme-relative URL) or
    // directly follows "scheme:"; a "//" inside the path or query must not be mistaken for it.
    auto i = value.indexOf("//");
    immutable followsScheme = i > 0 && value[i - 1] == ':' &&
        value[0 .. i - 1].indexOfAny("/?#@:") == -1;
    if (i == 0 || followsScheme) {
        if (i > 1) {
            url.scheme = value[0 .. i - 1];
        }
        value = value[i + 2 .. $];
    } else {
        url.scheme = "http";
    }

    // [user:password@]host[:port]  -- the authority ends at the first '/', '?' or '#'.
    auto authorityEnd = value.indexOfAny("/?#");
    string authority = authorityEnd == -1 ? value : value[0 .. authorityEnd];
    value = authorityEnd == -1 ? null : value[authorityEnd .. $];

    auto at = authority.indexOf('@');
    if (at > -1) {
        auto userInfo = authority[0 .. at];
        auto separator = userInfo.indexOf(':');
        if (separator == -1) {
            url.user = userInfo.percentDecode;
        } else {
            url.user = userInfo[0 .. separator].percentDecode;
            url.pass = userInfo[separator + 1 .. $].percentDecode;
        }
        authority = authority[at + 1 .. $];
    }

    auto portStart = authority.indexOf(':');
    if (portStart == -1) {
        url.host = authority.fromPuny;
    } else {
        url.host = authority[0 .. portStart].fromPuny;
        url.port = authority[portStart + 1 .. $].to!ushort;
    }
    if (value.length == 0) {
        return;
    }

    // [/]path[?query][#fragment]
    i = value.indexOfAny("?#");
    if (i == -1) {
        url.path = value.percentDecode;
        return;
    }
    url.path = value[0 .. i].percentDecode;

    auto c = value[i];
    value = value[i + 1 .. $];
    if (c == '?') {
        i = value.indexOf('#');
        string queryString;
        if (i < 0) {
            queryString = value;
            value = null;
        } else {
            queryString = value[0 .. i];
            value = value[i + 1 .. $];
        }
        foreach (q; queryString.split('&')) {
            auto j = q.indexOf('=');
            string key, val;
            if (j < 0) {
                key = q;
            } else {
                key = q[0 .. j];
                val = q[j + 1 .. $];
            }
            key = key.percentDecode;
            val = val.percentDecode;
            // Keep the deprecated map in sync with the multimap.
            url.query[key] = val;
            url.queryParams.add(key, val);
        }
    }

    url.fragment = value.percentDecode;
}

unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            query["hello"] = "world";
            query["gibe"] = "clay";
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            query["❄"] = "❀";
            query["["] = "]";
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            query["hi"] = "bye";
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest
{
    auto url = "//foo/bar".parseURL;
    assert(url.host == "foo", "expected host foo, got " ~ url.host);
    assert(url.path == "/bar");
}

unittest
{
    auto url = "localhost:5984".parseURL;
    auto url2 = url ~ "db1";
    assert(url2.toString == "http://localhost:5984/db1", url2.toString);
    auto url3 = url2 ~ "_all_docs";
    assert(url3.toString == "http://localhost:5984/db1/_all_docs", url3.toString);
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("gibe", "clay");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Passing an array of query values.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("hello", "aether");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&hello=aether#frag" ||
                url.toString == "https://example.org/foo/bar?hello=aether&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            queryParams.add("❄", "❀");
            queryParams.add("[", "]");
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            queryParams.add("hi", "bye");
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest {
    // Percent decoding.

    // http://#:!:@
    auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
    auto url = urlString.parseURL;
    assert(url.user == "#");
    assert(url.pass == "!:");
    assert(url.host == "example.org");
    assert(url.path == "/{/}");
    assert(url.queryParams[";"].front == "");
    assert(url.queryParams["&"].front == "=");
    assert(url.fragment == "#hash");

    // Round trip.
    assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
    assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.host == "☂.☃.org", url.host);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
    assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString);
}

unittest {
    auto url = "https://☂.☃.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
    // There's an existing path.
    auto url = parseURL("http://example.org/foo");
    URL url2;
    // No slash? Assume it needs a slash.
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // With slash? Don't add another.
    url2 = url ~ "/bar";
    assert(url2.toString == "http://example.org/foo/bar", url2.toString);
    url ~= "bar";
    assert(url.toString == "http://example.org/foo/bar");

    // Path already ends with a slash; don't add another.
    url = parseURL("http://example.org/foo/");
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // Still don't add one even if you're appending with a slash.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "/bar";
    assert(url.toString == "http://example.org/foo/bar");

    // No path.
    url = parseURL("http://example.org");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar");

    // Path is just a slash.
    url = parseURL("http://example.org/");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar", url.toString);

    // No path, just fragment.
    url = "ircs://irc.freenode.com/#d".parseURL;
    assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest {
    import std.net.curl;
    auto url = "http://example.org".parseURL;
    assert(is(typeof(std.net.curl.get(url))));
}

// Regression tests for QueryParams lookups and overwrites with repeated keys.
unittest {
    QueryParams q;
    q.add("a", "1");
    q.add("b", "2");
    q.add("a", "3");
    // Only values stored under the requested key are returned.
    assert(q["a"].equal(["1", "3"]));
    assert(q["b"].equal(["2"]));
    assert(q["missing"].empty);

    // Every previous occurrence of the key is replaced, including the last element.
    q.overwrite("a", "4");
    assert(q.length == 2);
    assert(q["a"].equal(["4"]));
    assert(q["b"].equal(["2"]));
}

// Regression tests for malformed and unusual inputs to the parser.
unittest {
    URL u;
    // Bad percent-encoding in a path without query or fragment is a failure, not an exception.
    assert(!tryParseURL("http://example.org/%zz", u));
    // Truncated Punycode in the host is a failure, not a range violation.
    assert(!tryParseURL("http://xn--9/", u));

    // A user name without a password.
    assert(tryParseURL("http://bob@example.org:8080/x", u));
    assert(u.user == "bob");
    assert(u.pass == "");
    assert(u.host == "example.org");
    assert(u.port == 8080);
    assert(u.toString == "http://bob@example.org:8080/x", u.toString);

    // A double slash inside the path is not a scheme separator.
    assert(tryParseURL("example.org/a//b", u));
    assert(u.scheme == "http");
    assert(u.host == "example.org");
    assert(u.path == "/a//b", u.path);

    // A query directly after the host.
    assert(tryParseURL("http://example.org?x=1", u));
    assert(u.host == "example.org");
    assert(u.queryParams["x"].front == "1");
}

/**
 * Parse the input string as a URL.
 *
 * Throws:
 *   URLException if the string was in an incorrect format.
 */
URL parseURL(string value) {
    URL url;
    if (tryParseURL(value, url)) {
        return url;
    }
    throw new URLException("failed to parse URL " ~ value);
}

///
unittest {
    {
        // Infer scheme
        auto u1 = parseURL("example.org");
        assert(u1.scheme == "http");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 80);
        assert(u1.providedPort == 0);
        assert(u1.fragment == "");
    }
    {
        // Simple host and scheme
        auto u1 = parseURL("https://example.org");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With path
        auto u1 = parseURL("https://example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With explicit port
        auto u1 = parseURL("https://example.org:1021/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 1021);
        assert(u1.providedPort == 1021);
    }
    {
        // With user
        auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
    }
    {
        // With user, URL-encoded
        auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob!");
        assert(u1.pass == "secret!?");
    }
    {
        // With user and port and path
        auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 2210);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
        assert(u1.fragment == "");
    }
    {
        // With query string
        auto u1 = parseURL("https://example.org/?login=true");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.queryParams["login"].front == "true");
        assert(u1.fragment == "");
    }
    {
        // With query string and fragment
        auto u1 = parseURL("https://example.org/?login=true#justkidding");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.queryParams["login"].front == "true");
        assert(u1.fragment == "justkidding");
    }
    {
        // With URL-encoded values
        auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
        assert(u1.queryParams["❄"].front == "=");
        assert(u1.fragment == "^");
    }
}

unittest {
    assert(parseURL("http://example.org").port == 80);
    assert(parseURL("http://example.org:5326").port == 5326);

    auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
    assert(url.scheme == "redis");
    assert(url.user == "admin");
    assert(url.pass == "password");

    assert(parseURL("example.org").toString == "http://example.org/");
    assert(parseURL("http://example.org:80").toString == "http://example.org/");

    assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}

/**
 * Percent-encode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * Params:
 *   raw = the text to encode, as UTF-8
 *
 * Returns: the input with every octet other than ASCII letters, digits and "-._~" replaced by
 * "%XX", where XX is the octet in upper-case hexadecimal.
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    static immutable hexDigits = "0123456789ABCDEF";
    Appender!string app;
    // Working octet by octet handles ASCII and multi-byte UTF-8 sequences uniformly: every octet
    // of a non-ASCII character is >= 0x80 and therefore gets escaped.
    foreach (char c; raw) {
        if (('a' <= c && 'z' >= c) ||
                ('A' <= c && 'Z' >= c) ||
                ('0' <= c && '9' >= c) ||
                c == '-' || c == '.' || c == '_' || c == '~') {
            app ~= c;
            continue;
        }
        app ~= '%';
        app ~= hexDigits[c >> 4];
        app ~= hexDigits[c & 0x0F];
    }
    return app.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This explicitly ensures that the result is a valid UTF-8 string.
 *
 * Throws: URLException if a percent sequence is malformed or the decoded octets are not valid
 * UTF-8.
 */
@trusted string percentDecode(string encoded) {
    ubyte[] raw = percentDecodeRaw(encoded);
    // This cast is not considered @safe because it converts from one pointer type to another.
    // However, it's 1-byte values in either case, no reference types, so this won't result in any
    // memory safety errors. We also check for validity immediately.
    auto s = cast(string) raw;
    if (!s.isValid) {
        throw new URLException("input contains invalid UTF data");
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    e = percentDecode("%e2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    try {
        // %ES is an invalid percent sequence: 'S' is not a hex digit.
        percentDecode("%es");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        percentDecode("%e");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        // A lone percent sign used to index past the end of the input.
        percentDecode("%");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException.
 */
ubyte[] percentDecodeRaw(string encoded) {
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    Appender!(ubyte[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= cast(ubyte) encoded[i];
            continue;
        }
        // Written as an addition so that inputs shorter than two characters cannot make the
        // unsigned length wrap around.
        if (i + 2 >= encoded.length) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        if (isHex(encoded[i + 1]) && isHex(encoded[i + 2])) {
            auto b = fromHex(encoded[i + 1]);
            auto c = fromHex(encoded[i + 2]);
            app ~= cast(ubyte)((b << 4) | c);
        } else {
            throw new URLException("Invalid percent encoded value: expected two hex digits after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        i += 2;
    }
    return app.data;
}

// True if c is an ASCII hexadecimal digit of either case.
private bool isHex(char c) {
    return ('0' <= c && '9' >= c) ||
        ('a' <= c && 'f' >= c) ||
        ('A' <= c && 'F' >= c);
}

// The numeric value of a hexadecimal digit. The caller must have checked the digit with isHex.
private ubyte fromHex(char s) {
    if (s >= '0' && s <= '9') {
        return cast(ubyte)(s - '0');
    }
    if (s >= 'a' && s <= 'f') {
        return cast(ubyte)(s - 'a' + 10);
    }
    return cast(ubyte)(s - 'A' + 10);
}

// Converts a host name to its ASCII form, Punycode-encoding each label if the name contains any
// non-ASCII character. Throws URLException if an illegal ASCII character is found before the
// first non-ASCII character.
private string toPuny(string unicodeHostname) {
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c >= 0x80) {
            mustEncode = true;
            break;
        }
        // Rejected ranges: everything below ',', ':' through '@', '[' through '`', and '{' up.
        if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    return unicodeHostname.split('.').map!punyEncode.join(".");
}

// Regression test: the characters between '9' and 'A' are not legal in a host name.
unittest {
    URL u;
    u.scheme = "http";
    u.host = "exa@mple.org";
    try {
        auto s = u.toString();
        assert(false, "expected exception not thrown, got " ~ s);
    } catch (URLException) {
    }
}

// Converts a host name from its ASCII form, Punycode-decoding each label.
private string fromPuny(string hostname) {
    return hostname.split('.').map!punyDecode.join(".");
}

private {
    // Parameters of the Punycode algorithm.
    enum delimiter = '-';
    enum marker = "xn--";
    enum ulong damp = 700;
    enum ulong tmin = 1;
    enum ulong tmax = 26;
    enum ulong skew = 38;
    enum ulong base = 36;
    enum ulong initialBias = 72;
    enum dchar initialN = cast(dchar)128;

    // The bias adaptation function of the Punycode algorithm.
    ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
        if (firstTime) {
            delta /= damp;
        } else {
            delta /= 2;
        }
        delta += delta / numPoints;
        ulong k = 0;
        while (delta > ((base - tmin) * tmax) / 2) {
            delta /= (base - tmin);
            k += base;
        }
        return k + (((base - tmin + 1) * delta) / (delta + skew));
    }
}

/**
 * Encode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-encode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "☂.☃.com";
 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
 * ---
 *
 * Returns: the encoded segment, or the input unchanged if it contains only ASCII.
 *
 * Throws: URLException if the encoding overflows.
 */
string punyEncode(string input) {
    ulong delta = 0;
    dchar n = initialN;
    auto bias = initialBias;
    Appender!string output;
    output ~= marker;
    auto pushed = 0;
    auto codePoints = 0;
    // Copy the basic (ASCII) code points straight through.
    foreach (dchar c; input) {
        codePoints++;
        if (c < initialN) {
            output ~= c;
            pushed++;
        }
    }
    if (pushed < codePoints) {
        if (pushed > 0) {
            output ~= delimiter;
        }
    } else {
        // No encoding to do.
        return input;
    }
    bool first = true;
    while (pushed < codePoints) {
        // Find the smallest code point that has not been handled yet.
        auto best = dchar.max;
        foreach (dchar c; input) {
            if (n <= c && c < best) {
                best = c;
            }
        }
        if (best == dchar.max) {
            throw new URLException("failed to find a new codepoint to process during punyencode");
        }
        // Widen before multiplying so the product cannot wrap before the overflow check.
        delta += cast(ulong)(best - n) * (pushed + 1);
        if (delta > uint.max) {
            throw new URLException("overflow during punyencode");
        }
        n = best;
        foreach (dchar c; input) {
            if (c < n) {
                delta++;
            }
            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                ulong q = delta;
                auto k = base;
                while (true) {
                    ulong t;
                    if (k <= bias) {
                        t = tmin;
                    } else if (k >= bias + tmax) {
                        t = tmax;
                    } else {
                        t = k - bias;
                    }
                    if (q < t) {
                        break;
                    }
                    output ~= digitToBasic(t + ((q - t) % (base - t)));
                    q = (q - t) / (base - t);
                    k += base;
                }
                output ~= digitToBasic(q);
                pushed++;
                bias = adapt(delta, pushed, first);
                first = false;
                delta = 0;
            }
        }
        delta++;
        n++;
    }
    return cast(string)output.data;
}

/**
 * Decode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-decode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "xn--m3h.xn--n3h.com";
 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
 * ---
 *
 * Returns: the decoded segment, or the input unchanged if it does not start with "xn--".
 *
 * Throws: URLException if the encoded data is truncated, contains a character that is not a
 * Punycode digit, overflows, or decodes to something that is not a valid code point.
 */
string punyDecode(string input) {
    if (!input.startsWith(marker)) {
        return input;
    }
    input = input[marker.length..$];

    // let n = initial_n
    dchar n = initialN;

    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    ulong i = 0;
    auto bias = initialBias;
    dchar[] output;
    // This reserves a bit more than necessary, but it should be more efficient overall than just
    // appending and inserting willy-nilly.
    output.reserve(input.length);

    // consume all code points before the last delimiter (if there is one)
    //   and copy them to output, fail on any non-basic code point
    // if more than zero code points were consumed then consume one more
    //   (which will be the last delimiter)
    auto end = input.lastIndexOf(delimiter);
    if (end > -1) {
        foreach (dchar c; input[0..end]) {
            output ~= c;
        }
        input = input[end+1 .. $];
    }

    // while the input is not exhausted do begin
    size_t pos = 0;
    while (pos < input.length) {
        //   let oldi = i
        //   let w = 1
        auto oldi = i;
        ulong w = 1;
        //   for k = base to infinity in steps of base do begin
        for (ulong k = base; ; k += base) {
            //     consume a code point, or fail if there was none to consume
            // Note that the input is all ASCII, so we can simply index the input string bytewise.
            if (pos >= input.length) {
                throw new URLException("truncated input during punydecode");
            }
            auto c = input[pos];
            pos++;
            //     let digit = the code point's digit-value, fail if it has none
            auto digit = basicToDigit(c);
            if (digit >= base) {
                throw new URLException("invalid digit during punydecode");
            }
            //     let i = i + digit * w, fail on overflow
            i += digit * w;
            if (i > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            //     let t = tmin if k <= bias {+ tmin}, or
            //             tmax if k >= bias + tmax, or k - bias otherwise
            ulong t;
            if (k <= bias) {
                t = tmin;
            } else if (k >= bias + tmax) {
                t = tmax;
            } else {
                t = k - bias;
            }
            //     if digit < t then break
            if (digit < t) {
                break;
            }
            //     let w = w * (base - t), fail on overflow
            w *= (base - t);
            if (w > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            //   end
        }
        //   let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, output.length + 1, oldi == 0);
        //   let n = n + i div (length(output) + 1), fail on overflow
        ulong codePoint = n + i / (output.length + 1);
        if (codePoint > dchar.max || !isValidDchar(cast(dchar) codePoint)) {
            throw new URLException("invalid code point during punydecode");
        }
        n = cast(dchar) codePoint;
        //   let i = i mod (length(output) + 1)
        i %= (output.length + 1);
        //   {if n is a basic code point then fail}
        // (We aren't actually going to fail here; it's clear what this means.)
        //   insert n into output at position i
        (() @trusted { output.insertInPlace(cast(size_t) i, n); })();  // should be @safe but isn't marked
        //   increment i
        i++;
        // end
    }
    return output.to!string;
}

// Regression tests: malformed Punycode is reported as a URLException.
unittest {
    try {
        // The digit sequence stops before a terminating digit is reached.
        punyDecode("xn--9");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        // '!' is not a Punycode digit.
        punyDecode("xn--!");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }
}

// Lifted from punycode.js.
// Maps a digit value in 0 .. 35 to its basic code point: 0 .. 25 become 'a' .. 'z' and 26 .. 35
// become '0' .. '9'.
private dchar digitToBasic(ulong digit) {
    return cast(dchar)(digit < 26 ? 'a' + digit : '0' + (digit - 26));
}

// Lifted from punycode.js.
// Maps a basic code point to its digit value: '0'..'9' give 26..35, and 'A'..'Z' and 'a'..'z'
// both give 0..25. Any other character yields `base`, which is not a valid digit.
private uint basicToDigit(char c) {
    immutable uint value = c;
    if (value >= '0' && value <= '9') {
        // '0' is 48, and digits start at 26, hence the offset of 22.
        return value - 22;
    }
    if (value >= 'A' && value <= 'Z') {
        return value - 'A';
    }
    if (value >= 'a' && value <= 'z') {
        return value - 'a';
    }
    return base;
}

// Encoding: each entry pairs a Unicode segment with its expected Punycode form.
unittest {
    string[2][] cases = [
        ["b\u00FCcher", "xn--bcher-kva"],
        ["b\u00FCc\u00FCher", "xn--bcher-kvab"],
        ["ýbücher", "xn--bcher-kvaf"],
        ["mañana", "xn--maana-pta"],
        ["\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
            "xn--egbpdaj6bu4bxfgehfvwxn"],
    ];
    foreach (pair; cases) {
        auto encoded = punyEncode(pair[0]);
        assert(encoded == pair[1], encoded);
    }
    import std.stdio;
}

// Decoding: the reverse direction for a couple of the segments above.
unittest {
    auto arabic = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
    assert(arabic == "ليهمابتكلموشعربي؟", arabic);

    auto spanish = punyDecode("xn--maana-pta");
    assert(spanish == "mañana");
}

// Whole domain names: split on dots, convert each segment, and join again.
unittest {
    import std.string, std.algorithm, std.array, std.range;

    auto encodedInput = "xn--m3h.xn--n3h.com";
    auto decodedDomain = encodedInput.splitter(".").map!(punyDecode).join(".");
    assert(decodedDomain == "☂.☃.com", decodedDomain);

    auto unicodeInput = "☂.☃.com";
    auto encodedDomain = unicodeInput.splitter(".").map!(punyEncode).join(".");
    assert(encodedDomain == "xn--m3h.xn--n3h.com", encodedDomain);
}