1 /** 2 * A URL handling library. 3 * 4 * URLs are Unique Resource Locators. They consist of a scheme and a host, with some optional 5 * elements like port, path, username, and password. 6 * 7 * This module aims to make it simple to muck about with them. 8 * 9 * Example usage: 10 * --- 11 * auto url = "ssh://me:password@192.168.0.8/".parseURL; 12 * auto files = system("ssh", url.toString, "ls").splitLines; 13 * foreach (file; files) { 14 * system("scp", url ~ file, "."); 15 * } 16 * --- 17 * 18 * License: The MIT license. 19 */ 20 module url; 21 22 import std.algorithm; 23 import std.array; 24 import std.conv; 25 import std.encoding; 26 import std.string; 27 import std.utf; 28 29 @safe: 30 31 /// An exception thrown when something bad happens with URLs. 32 class URLException : Exception { 33 this(string msg) { super(msg); } 34 } 35 36 /** 37 * A mapping from schemes to their default ports. 38 * 39 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to 40 * use even if they use ports. Entries here should be treated as best guesses. 41 */ 42 ushort[string] schemeToDefaultPort; 43 44 static this() { 45 schemeToDefaultPort = [ 46 "aaa": 3868, 47 "aaas": 5658, 48 "acap": 674, 49 "amqp": 5672, 50 "cap": 1026, 51 "coap": 5683, 52 "coaps": 5684, 53 "dav": 443, 54 "dict": 2628, 55 "ftp": 21, 56 "git": 9418, 57 "go": 1096, 58 "gopher": 70, 59 "http": 80, 60 "https": 443, 61 "iac": 4569, 62 "icap": 1344, 63 "imap": 143, 64 "ipp": 631, 65 "ipps": 631, // yes, they're both mapped to port 631 66 "irc": 6667, // De facto default port, not the IANA reserved port. 
67 "ircs": 6697, 68 "iris": 702, // defaults to iris.beep 69 "iris.beep": 702, 70 "iris.lwz": 715, 71 "iris.xpc": 713, 72 "iris.xpcs": 714, 73 "jabber": 5222, // client-to-server 74 "ldap": 389, 75 "ldaps": 636, 76 "msrp": 2855, 77 "msrps": 2855, 78 "mtqp": 1038, 79 "mupdate": 3905, 80 "news": 119, 81 "nfs": 2049, 82 "pop": 110, 83 "redis": 6379, 84 "reload": 6084, 85 "rsync": 873, 86 "rtmfp": 1935, 87 "rtsp": 554, 88 "shttp": 80, 89 "sieve": 4190, 90 "sip": 5060, 91 "sips": 5061, 92 "smb": 445, 93 "smtp": 25, 94 "snews": 563, 95 "snmp": 161, 96 "soap.beep": 605, 97 "ssh": 22, 98 "stun": 3478, 99 "stuns": 5349, 100 "svn": 3690, 101 "teamspeak": 9987, 102 "telnet": 23, 103 "tftp": 69, 104 "tip": 3372, 105 ]; 106 } 107 108 /** 109 * A collection of query parameters. 110 * 111 * This is effectively a multimap of string -> strings. 112 */ 113 struct QueryParams { 114 import std.typecons; 115 alias Tuple!(string, "key", string, "value") Param; 116 Param[] params; 117 118 @property size_t length() { 119 return params.length; 120 } 121 122 /// Get a range over the query parameter values for the given key. 123 auto opIndex(string key) { 124 return params.find!(x => x.key == key).map!(x => x.value); 125 } 126 127 /// Add a query parameter with the given key and value. 128 /// If one already exists, there will now be two query parameters with the given name. 129 void add(string key, string value) { 130 params ~= Param(key, value); 131 } 132 133 /// Add a query parameter with the given key and value. 134 /// If there are any existing parameters with the same key, they are removed and overwritten. 
135 void overwrite(string key, string value) { 136 for (int i = 0; i < params.length; i++) { 137 if (params[i].key == key) { 138 params[i] = params[$-1]; 139 params.length--; 140 } 141 } 142 params ~= Param(key, value); 143 } 144 145 private struct QueryParamRange { 146 size_t i; 147 const(Param)[] params; 148 bool empty() { return i >= params.length; } 149 void popFront() { i++; } 150 Param front() { return params[i]; } 151 } 152 153 /** 154 * A range over the query parameters. 155 * 156 * Usage: 157 * --- 158 * foreach (key, value; url.queryParams) {} 159 * --- 160 */ 161 auto range() { 162 return QueryParamRange(0, this.params); 163 } 164 /// ditto 165 alias range this; 166 } 167 168 /** 169 * A Unique Resource Locator. 170 * 171 * URLs can be parsed (see parseURL) and implicitly convert to strings. 172 */ 173 struct URL { 174 /// The URL scheme. For instance, ssh, ftp, or https. 175 string scheme; 176 177 /// The username in this URL. Usually absent. If present, there will also be a password. 178 string user; 179 180 /// The password in this URL. Usually absent. 181 string pass; 182 183 /// The hostname. 184 string host; 185 186 /** 187 * The port. 188 * 189 * This is inferred from the scheme if it isn't present in the URL itself. 190 * If the scheme is not known and the port is not present, the port will be given as 0. 191 * For some schemes, port will not be sensible -- for instance, file or chrome-extension. 192 * 193 * If you explicitly need to detect whether the user provided a port, check the providedPort 194 * field. 195 */ 196 @property ushort port() { 197 if (providedPort != 0) { 198 return providedPort; 199 } 200 if (auto p = scheme in schemeToDefaultPort) { 201 return *p; 202 } 203 return 0; 204 } 205 206 /** 207 * Set the port. 208 * 209 * This sets the providedPort field and is provided for convenience. 
210 */ 211 @property ushort port(ushort value) { 212 return providedPort = value; 213 } 214 215 /// The port that was explicitly provided in the URL. 216 ushort providedPort; 217 218 /** 219 * The path. 220 * 221 * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is 222 * "/news/story/17774". 223 */ 224 string path; 225 226 /** 227 * Deprecated: this disallows multiple values for the same query string. Please use queryParams 228 * instead. 229 * 230 * The query string elements. 231 * 232 * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string 233 * elements will be ["visited": "false"]. 234 * 235 * Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be 236 * ["item": ""]. 237 * 238 * This field is mutable, so be cautious. 239 */ 240 deprecated("use queryParams") string[string] query; 241 242 /** 243 * The query parameters associated with this URL. 244 */ 245 QueryParams queryParams; 246 247 /** 248 * The fragment. In web documents, this typically refers to an anchor element. 249 * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2". 250 */ 251 string fragment; 252 253 /** 254 * Convert this URL to a string. 255 * The string is properly formatted and usable for, eg, a web request. 256 */ 257 string toString() { 258 return toString(false); 259 } 260 261 /** 262 * Convert this URL to a string. 263 * The string is intended to be human-readable rather than machine-readable. 264 */ 265 string toHumanReadableString() { 266 return toString(true); 267 } 268 269 private string toString(bool humanReadable) { 270 Appender!string s; 271 s ~= scheme; 272 s ~= "://"; 273 if (user) { 274 s ~= humanReadable ? user : user.percentEncode; 275 s ~= ":"; 276 s ~= humanReadable ? pass : pass.percentEncode; 277 s ~= "@"; 278 } 279 s ~= humanReadable ? 
host : host.toPuny; 280 if (providedPort) { 281 if ((scheme in schemeToDefaultPort) == null || schemeToDefaultPort[scheme] != providedPort) { 282 s ~= ":"; 283 s ~= providedPort.to!string; 284 } 285 } 286 string p = path; 287 if (p.length == 0 || p == "/") { 288 s ~= '/'; 289 } else { 290 if (p[0] == '/') { 291 p = p[1..$]; 292 } 293 if (humanReadable) { 294 s ~= p; 295 } else { 296 foreach (part; p.split('/')) { 297 s ~= '/'; 298 s ~= part.percentEncode; 299 } 300 } 301 } 302 if (queryParams.length) { 303 bool first = true; 304 s ~= '?'; 305 foreach (k, v; queryParams) { 306 if (!first) { 307 s ~= '&'; 308 } 309 first = false; 310 s ~= k.percentEncode; 311 if (v.length > 0) { 312 s ~= '='; 313 s ~= v.percentEncode; 314 } 315 } 316 } else if (query) { 317 s ~= '?'; 318 bool first = true; 319 foreach (k, v; query) { 320 if (!first) { 321 s ~= '&'; 322 } 323 first = false; 324 s ~= k.percentEncode; 325 if (v.length > 0) { 326 s ~= '='; 327 s ~= v.percentEncode; 328 } 329 } 330 } 331 if (fragment) { 332 s ~= '#'; 333 s ~= fragment.percentEncode; 334 } 335 return s.data; 336 } 337 338 /// Implicitly convert URLs to strings. 339 alias toString this; 340 341 /** 342 * The append operator (~). 343 * 344 * The append operator for URLs returns a new URL with the given string appended as a path 345 * element to the URL's path. It only adds new path elements (or sequences of path elements). 346 * 347 * Don't worry about path separators; whether you include them or not, it will just work. 348 * 349 * Query elements are copied. 350 * 351 * Examples: 352 * --- 353 * auto random = "http://testdata.org/random".parseURL; 354 * auto randInt = random ~ "int"; 355 * writeln(randInt); // prints "http://testdata.org/random/int" 356 * --- 357 */ 358 URL opBinary(string op : "~")(string subsequentPath) { 359 URL other = this; 360 other ~= subsequentPath; 361 if (query) { 362 other.query = other.query.dup; 363 } 364 return other; 365 } 366 367 /** 368 * The append-in-place operator (~=). 
369 * 370 * The append operator for URLs adds a path element to this URL. It only adds new path elements 371 * (or sequences of path elements). 372 * 373 * Don't worry about path separators; whether you include them or not, it will just work. 374 * 375 * Examples: 376 * --- 377 * auto random = "http://testdata.org/random".parseURL; 378 * random ~= "int"; 379 * writeln(random); // prints "http://testdata.org/random/int" 380 * --- 381 */ 382 URL opOpAssign(string op : "~")(string subsequentPath) { 383 if (path.endsWith("/")) { 384 if (subsequentPath.startsWith("/")) { 385 path ~= subsequentPath[1..$]; 386 } else { 387 path ~= subsequentPath; 388 } 389 } else { 390 if (!subsequentPath.startsWith("/")) { 391 path ~= '/'; 392 } 393 path ~= subsequentPath; 394 } 395 return this; 396 } 397 } 398 399 /** 400 * Parse a URL from a string. 401 * 402 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes 403 * may be made. However, any URL in a correct format will be parsed correctly. 404 */ 405 bool tryParseURL(string value, out URL url) { 406 url = URL.init; 407 // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment] 408 // Scheme is optional in common use. We infer 'http' if it's not given. 409 auto i = value.indexOf("//"); 410 if (i > -1) { 411 if (i > 1) { 412 url.scheme = value[0..i-1]; 413 } 414 value = value[i+2 .. $]; 415 } else { 416 url.scheme = "http"; 417 } 418 // [user:password@]host[:port]][/]path[?query][#fragment 419 i = value.indexOfAny([':', '/']); 420 if (i == -1) { 421 // Just a hostname. 422 url.host = value.fromPuny; 423 return true; 424 } 425 426 if (value[i] == ':') { 427 // This could be between username and password, or it could be between host and port. 428 auto j = value.indexOfAny(['@', '/']); 429 if (j > -1 && value[j] == '@') { 430 try { 431 url.user = value[0..i].percentDecode; 432 url.pass = value[i+1 .. 
j].percentDecode; 433 } catch (URLException) { 434 return false; 435 } 436 value = value[j+1 .. $]; 437 } 438 } 439 440 // It's trying to be a host/port, not a user/pass. 441 i = value.indexOfAny([':', '/']); 442 if (i == -1) { 443 url.host = value.fromPuny; 444 return true; 445 } 446 url.host = value[0..i].fromPuny; 447 value = value[i .. $]; 448 if (value[0] == ':') { 449 auto end = value.indexOf('/'); 450 if (end == -1) { 451 end = value.length; 452 } 453 try { 454 url.port = value[1 .. end].to!ushort; 455 } catch (ConvException) { 456 return false; 457 } 458 value = value[end .. $]; 459 if (value.length == 0) { 460 return true; 461 } 462 } 463 464 i = value.indexOfAny("?#"); 465 if (i == -1) { 466 url.path = value.percentDecode; 467 return true; 468 } 469 470 try { 471 url.path = value[0..i].percentDecode; 472 } catch (URLException) { 473 return false; 474 } 475 auto c = value[i]; 476 value = value[i + 1 .. $]; 477 if (c == '?') { 478 i = value.indexOf('#'); 479 string query; 480 if (i < 0) { 481 query = value; 482 value = null; 483 } else { 484 query = value[0..i]; 485 value = value[i + 1 .. $]; 486 } 487 auto queries = query.split('&'); 488 foreach (q; queries) { 489 auto j = q.indexOf('='); 490 string key, val; 491 if (j < 0) { 492 key = q; 493 } else { 494 key = q[0..j]; 495 val = q[j + 1 .. $]; 496 } 497 try { 498 key = key.percentDecode; 499 val = val.percentDecode; 500 } catch (URLException) { 501 return false; 502 } 503 url.query[key] = val; 504 url.queryParams.add(key, val); 505 } 506 } 507 508 try { 509 url.fragment = value.percentDecode; 510 } catch (URLException) { 511 return false; 512 } 513 514 return true; 515 } 516 517 unittest { 518 { 519 // Basic. 520 URL url; 521 with (url) { 522 scheme = "https"; 523 host = "example.org"; 524 path = "/foo/bar"; 525 query["hello"] = "world"; 526 query["gibe"] = "clay"; 527 fragment = "frag"; 528 } 529 assert( 530 // Not sure what order it'll come out in. 
531 url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" || 532 url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag", 533 url.toString); 534 } 535 { 536 // Percent encoded. 537 URL url; 538 with (url) { 539 scheme = "https"; 540 host = "example.org"; 541 path = "/f☃o"; 542 query["❄"] = "❀"; 543 query["["] = "]"; 544 fragment = "ş"; 545 } 546 assert( 547 // Not sure what order it'll come out in. 548 url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" || 549 url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F", 550 url.toString); 551 } 552 { 553 // Port, user, pass. 554 URL url; 555 with (url) { 556 scheme = "https"; 557 host = "example.org"; 558 user = "dhasenan"; 559 pass = "itsasecret"; 560 port = 17; 561 } 562 assert( 563 url.toString == "https://dhasenan:itsasecret@example.org:17/", 564 url.toString); 565 } 566 { 567 // Query with no path. 568 URL url; 569 with (url) { 570 scheme = "https"; 571 host = "example.org"; 572 query["hi"] = "bye"; 573 } 574 assert( 575 url.toString == "https://example.org/?hi=bye", 576 url.toString); 577 } 578 } 579 580 unittest 581 { 582 auto url = "//foo/bar".parseURL; 583 assert(url.host == "foo", "expected host foo, got " ~ url.host); 584 assert(url.path == "/bar"); 585 } 586 587 unittest 588 { 589 auto url = "localhost:5984".parseURL; 590 auto url2 = url ~ "db1"; 591 assert(url2.toString == "http://localhost:5984/db1", url2.toString); 592 auto url3 = url2 ~ "_all_docs"; 593 assert(url3.toString == "http://localhost:5984/db1/_all_docs", url3.toString); 594 } 595 596 /// 597 unittest { 598 { 599 // Basic. 600 URL url; 601 with (url) { 602 scheme = "https"; 603 host = "example.org"; 604 path = "/foo/bar"; 605 queryParams.add("hello", "world"); 606 queryParams.add("gibe", "clay"); 607 fragment = "frag"; 608 } 609 assert( 610 // Not sure what order it'll come out in. 
611 url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" || 612 url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag", 613 url.toString); 614 } 615 { 616 // Passing an array of query values. 617 URL url; 618 with (url) { 619 scheme = "https"; 620 host = "example.org"; 621 path = "/foo/bar"; 622 queryParams.add("hello", "world"); 623 queryParams.add("hello", "aether"); 624 fragment = "frag"; 625 } 626 assert( 627 // Not sure what order it'll come out in. 628 url.toString == "https://example.org/foo/bar?hello=world&hello=aether#frag" || 629 url.toString == "https://example.org/foo/bar?hello=aether&hello=world#frag", 630 url.toString); 631 } 632 { 633 // Percent encoded. 634 URL url; 635 with (url) { 636 scheme = "https"; 637 host = "example.org"; 638 path = "/f☃o"; 639 queryParams.add("❄", "❀"); 640 queryParams.add("[", "]"); 641 fragment = "ş"; 642 } 643 assert( 644 // Not sure what order it'll come out in. 645 url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" || 646 url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F", 647 url.toString); 648 } 649 { 650 // Port, user, pass. 651 URL url; 652 with (url) { 653 scheme = "https"; 654 host = "example.org"; 655 user = "dhasenan"; 656 pass = "itsasecret"; 657 port = 17; 658 } 659 assert( 660 url.toString == "https://dhasenan:itsasecret@example.org:17/", 661 url.toString); 662 } 663 { 664 // Query with no path. 665 URL url; 666 with (url) { 667 scheme = "https"; 668 host = "example.org"; 669 queryParams.add("hi", "bye"); 670 } 671 assert( 672 url.toString == "https://example.org/?hi=bye", 673 url.toString); 674 } 675 } 676 677 unittest { 678 // Percent decoding. 
679 680 // http://#:!:@ 681 auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash"; 682 auto url = urlString.parseURL; 683 assert(url.user == "#"); 684 assert(url.pass == "!:"); 685 assert(url.host == "example.org"); 686 assert(url.path == "/{/}"); 687 assert(url.queryParams[";"].front == ""); 688 assert(url.queryParams["&"].front == "="); 689 assert(url.fragment == "#hash"); 690 691 // Round trip. 692 assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString); 693 assert(urlString == urlString.parseURL.toString.parseURL.toString); 694 } 695 696 unittest { 697 auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL; 698 assert(url.host == "☂.☃.org", url.host); 699 } 700 701 unittest { 702 auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL; 703 assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString); 704 assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString); 705 } 706 707 unittest { 708 auto url = "https://☂.☃.org/?hi=bye".parseURL; 709 assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye"); 710 } 711 712 /// 713 unittest { 714 // There's an existing path. 715 auto url = parseURL("http://example.org/foo"); 716 URL url2; 717 // No slash? Assume it needs a slash. 718 assert((url ~ "bar").toString == "http://example.org/foo/bar"); 719 // With slash? Don't add another. 720 url2 = url ~ "/bar"; 721 assert(url2.toString == "http://example.org/foo/bar", url2.toString); 722 url ~= "bar"; 723 assert(url.toString == "http://example.org/foo/bar"); 724 725 // Path already ends with a slash; don't add another. 726 url = parseURL("http://example.org/foo/"); 727 assert((url ~ "bar").toString == "http://example.org/foo/bar"); 728 // Still don't add one even if you're appending with a slash. 729 assert((url ~ "/bar").toString == "http://example.org/foo/bar"); 730 url ~= "/bar"; 731 assert(url.toString == "http://example.org/foo/bar"); 732 733 // No path. 
734 url = parseURL("http://example.org"); 735 assert((url ~ "bar").toString == "http://example.org/bar"); 736 assert((url ~ "/bar").toString == "http://example.org/bar"); 737 url ~= "bar"; 738 assert(url.toString == "http://example.org/bar"); 739 740 // Path is just a slash. 741 url = parseURL("http://example.org/"); 742 assert((url ~ "bar").toString == "http://example.org/bar"); 743 assert((url ~ "/bar").toString == "http://example.org/bar"); 744 url ~= "bar"; 745 assert(url.toString == "http://example.org/bar", url.toString); 746 747 // No path, just fragment. 748 url = "ircs://irc.freenode.com/#d".parseURL; 749 assert(url.toString == "ircs://irc.freenode.com/#d", url.toString); 750 } 751 752 unittest { 753 import std.net.curl; 754 auto url = "http://example.org".parseURL; 755 assert(is(typeof(std.net.curl.get(url)))); 756 } 757 758 /** 759 * Parse the input string as a URL. 760 * 761 * Throws: 762 * URLException if the string was in an incorrect format. 763 */ 764 URL parseURL(string value) { 765 URL url; 766 if (tryParseURL(value, url)) { 767 return url; 768 } 769 throw new URLException("failed to parse URL " ~ value); 770 } 771 772 /// 773 unittest { 774 { 775 // Infer scheme 776 auto u1 = parseURL("example.org"); 777 assert(u1.scheme == "http"); 778 assert(u1.host == "example.org"); 779 assert(u1.path == ""); 780 assert(u1.port == 80); 781 assert(u1.providedPort == 0); 782 assert(u1.fragment == ""); 783 } 784 { 785 // Simple host and scheme 786 auto u1 = parseURL("https://example.org"); 787 assert(u1.scheme == "https"); 788 assert(u1.host == "example.org"); 789 assert(u1.path == ""); 790 assert(u1.port == 443); 791 assert(u1.providedPort == 0); 792 } 793 { 794 // With path 795 auto u1 = parseURL("https://example.org/foo/bar"); 796 assert(u1.scheme == "https"); 797 assert(u1.host == "example.org"); 798 assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path); 799 assert(u1.port == 443); 800 assert(u1.providedPort == 0); 801 } 802 { 803 // With 
explicit port 804 auto u1 = parseURL("https://example.org:1021/foo/bar"); 805 assert(u1.scheme == "https"); 806 assert(u1.host == "example.org"); 807 assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path); 808 assert(u1.port == 1021); 809 assert(u1.providedPort == 1021); 810 } 811 { 812 // With user 813 auto u1 = parseURL("https://bob:secret@example.org/foo/bar"); 814 assert(u1.scheme == "https"); 815 assert(u1.host == "example.org"); 816 assert(u1.path == "/foo/bar"); 817 assert(u1.port == 443); 818 assert(u1.user == "bob"); 819 assert(u1.pass == "secret"); 820 } 821 { 822 // With user, URL-encoded 823 auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar"); 824 assert(u1.scheme == "https"); 825 assert(u1.host == "example.org"); 826 assert(u1.path == "/foo/bar"); 827 assert(u1.port == 443); 828 assert(u1.user == "bob!"); 829 assert(u1.pass == "secret!?"); 830 } 831 { 832 // With user and port and path 833 auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar"); 834 assert(u1.scheme == "https"); 835 assert(u1.host == "example.org"); 836 assert(u1.path == "/foo/bar"); 837 assert(u1.port == 2210); 838 assert(u1.user == "bob"); 839 assert(u1.pass == "secret"); 840 assert(u1.fragment == ""); 841 } 842 { 843 // With query string 844 auto u1 = parseURL("https://example.org/?login=true"); 845 assert(u1.scheme == "https"); 846 assert(u1.host == "example.org"); 847 assert(u1.path == "/", "expected path: / actual path: " ~ u1.path); 848 assert(u1.queryParams["login"].front == "true"); 849 assert(u1.fragment == ""); 850 } 851 { 852 // With query string and fragment 853 auto u1 = parseURL("https://example.org/?login=true#justkidding"); 854 assert(u1.scheme == "https"); 855 assert(u1.host == "example.org"); 856 assert(u1.path == "/", "expected path: / actual path: " ~ u1.path); 857 assert(u1.queryParams["login"].front == "true"); 858 assert(u1.fragment == "justkidding"); 859 } 860 { 861 // With URL-encoded values 862 auto u1 = 
parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E"); 863 assert(u1.scheme == "https"); 864 assert(u1.host == "example.org"); 865 assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path); 866 assert(u1.queryParams["❄"].front == "="); 867 assert(u1.fragment == "^"); 868 } 869 } 870 871 unittest { 872 assert(parseURL("http://example.org").port == 80); 873 assert(parseURL("http://example.org:5326").port == 5326); 874 875 auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment"); 876 assert(url.scheme == "redis"); 877 assert(url.user == "admin"); 878 assert(url.pass == "password"); 879 880 assert(parseURL("example.org").toString == "http://example.org/"); 881 assert(parseURL("http://example.org:80").toString == "http://example.org/"); 882 883 assert(parseURL("localhost:8070").toString == "http://localhost:8070/"); 884 } 885 886 /** 887 * Percent-encode a string. 888 * 889 * URL components cannot contain non-ASCII characters, and there are very few characters that are 890 * safe to include as URL components. Domain names using Unicode values use Punycode. For 891 * everything else, there is percent encoding. 892 */ 893 string percentEncode(string raw) { 894 // We *must* encode these characters: :/?#[]@!$&'()*+,;=" 895 // We *can* encode any other characters. 896 // We *should not* encode alpha, numeric, or -._~. 897 Appender!string app; 898 foreach (dchar d; raw) { 899 if (('a' <= d && 'z' >= d) || 900 ('A' <= d && 'Z' >= d) || 901 ('0' <= d && '9' >= d) || 902 d == '-' || d == '.' || d == '_' || d == '~') { 903 app ~= d; 904 continue; 905 } 906 // Something simple like a space character? Still in 7-bit ASCII? 907 // Then we get a single-character string out of it and just encode 908 // that one bit. 909 // Something not in 7-bit ASCII? Then we percent-encode each octet 910 // in the UTF-8 encoding (and hope the server understands UTF-8). 
911 char[] c; 912 encode(c, d); 913 auto bytes = cast(ubyte[])c; 914 foreach (b; bytes) { 915 app ~= format("%%%02X", b); 916 } 917 } 918 return cast(string)app.data; 919 } 920 921 /// 922 unittest { 923 assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding"); 924 assert(percentEncode("~~--..__") == "~~--..__"); 925 assert(percentEncode("0123456789") == "0123456789"); 926 927 string e; 928 929 e = percentEncode("☃"); 930 assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e); 931 } 932 933 /** 934 * Percent-decode a string. 935 * 936 * URL components cannot contain non-ASCII characters, and there are very few characters that are 937 * safe to include as URL components. Domain names using Unicode values use Punycode. For 938 * everything else, there is percent encoding. 939 * 940 * This explicitly ensures that the result is a valid UTF-8 string. 941 */ 942 @trusted string percentDecode(string encoded) { 943 ubyte[] raw = percentDecodeRaw(encoded); 944 // This cast is not considered @safe because it converts from one pointer type to another. 945 // However, it's 1-byte values in either case, no reference types, so this won't result in any 946 // memory safety errors. We also check for validity immediately. 947 auto s = cast(string) raw; 948 if (!s.isValid) { 949 // TODO(dhasenan): 950 throw new URLException("input contains invalid UTF data"); 951 } 952 return s; 953 } 954 955 /// 956 unittest { 957 assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding"); 958 assert(percentDecode("~~--..__") == "~~--..__"); 959 assert(percentDecode("0123456789") == "0123456789"); 960 961 string e; 962 963 e = percentDecode("%E2%98%83"); 964 assert(e == "☃", "expected a snowman but got" ~ e); 965 } 966 967 /** 968 * Percent-decode a string into a ubyte array. 969 * 970 * URL components cannot contain non-ASCII characters, and there are very few characters that are 971 * safe to include as URL components. 
Domain names using Unicode values use Punycode. For 972 * everything else, there is percent encoding. 973 * 974 * This yields a ubyte array and will not perform validation on the output. However, an improperly 975 * formatted input string will result in a URLException. 976 */ 977 ubyte[] percentDecodeRaw(string encoded) { 978 // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now. 979 Appender!(ubyte[]) app; 980 for (int i = 0; i < encoded.length; i++) { 981 if (encoded[i] != '%') { 982 app ~= encoded[i]; 983 continue; 984 } 985 if (i >= encoded.length - 2) { 986 throw new URLException("Invalid percent encoded value: expected two characters after " ~ 987 "percent symbol. Error at index " ~ i.to!string); 988 } 989 auto b = cast(ubyte)("0123456789ABCDEF".indexOf(encoded[i + 1])); 990 auto c = cast(ubyte)("0123456789ABCDEF".indexOf(encoded[i + 2])); 991 app ~= cast(ubyte)((b << 4) | c); 992 i += 2; 993 } 994 return app.data; 995 } 996 997 private string toPuny(string unicodeHostname) { 998 bool mustEncode = false; 999 foreach (i, dchar d; unicodeHostname) { 1000 auto c = cast(uint) d; 1001 if (c > 0x80) { 1002 mustEncode = true; 1003 break; 1004 } 1005 if (c < 0x2C || (c >= 0x3A && c <= 40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) { 1006 throw new URLException( 1007 format( 1008 "domain name '%s' contains illegal character '%s' at position %s", 1009 unicodeHostname, d, i)); 1010 } 1011 } 1012 if (!mustEncode) { 1013 return unicodeHostname; 1014 } 1015 return unicodeHostname.split('.').map!punyEncode.join("."); 1016 } 1017 1018 private string fromPuny(string hostname) { 1019 return hostname.split('.').map!punyDecode.join("."); 1020 } 1021 1022 private { 1023 enum delimiter = '-'; 1024 enum marker = "xn--"; 1025 enum ulong damp = 700; 1026 enum ulong tmin = 1; 1027 enum ulong tmax = 26; 1028 enum ulong skew = 38; 1029 enum ulong base = 36; 1030 enum ulong initialBias = 72; 1031 enum dchar initialN = cast(dchar)128; 1032 1033 ulong 
adapt(ulong delta, ulong numPoints, bool firstTime) { 1034 if (firstTime) { 1035 delta /= damp; 1036 } else { 1037 delta /= 2; 1038 } 1039 delta += delta / numPoints; 1040 ulong k = 0; 1041 while (delta > ((base - tmin) * tmax) / 2) { 1042 delta /= (base - tmin); 1043 k += base; 1044 } 1045 return k + (((base - tmin + 1) * delta) / (delta + skew)); 1046 } 1047 } 1048 1049 /** 1050 * Encode the input string using the Punycode algorithm. 1051 * 1052 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked 1053 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com" 1054 * in Punycode, you will get "xn--m3h.xn--n3h.com". 1055 * 1056 * In order to puny-encode a domain name, you must split it into its components. The following will 1057 * typically suffice: 1058 * --- 1059 * auto domain = "☂.☃.com"; 1060 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join("."); 1061 * --- 1062 */ 1063 string punyEncode(string input) { 1064 ulong delta = 0; 1065 dchar n = initialN; 1066 auto i = 0; 1067 auto bias = initialBias; 1068 Appender!string output; 1069 output ~= marker; 1070 auto pushed = 0; 1071 auto codePoints = 0; 1072 foreach (dchar c; input) { 1073 codePoints++; 1074 if (c <= initialN) { 1075 output ~= c; 1076 pushed++; 1077 } 1078 } 1079 if (pushed < codePoints) { 1080 if (pushed > 0) { 1081 output ~= delimiter; 1082 } 1083 } else { 1084 // No encoding to do. 
1085 return input; 1086 } 1087 bool first = true; 1088 while (pushed < codePoints) { 1089 auto best = dchar.max; 1090 foreach (dchar c; input) { 1091 if (n <= c && c < best) { 1092 best = c; 1093 } 1094 } 1095 if (best == dchar.max) { 1096 throw new URLException("failed to find a new codepoint to process during punyencode"); 1097 } 1098 delta += (best - n) * (pushed + 1); 1099 if (delta > uint.max) { 1100 // TODO better error message 1101 throw new URLException("overflow during punyencode"); 1102 } 1103 n = best; 1104 foreach (dchar c; input) { 1105 if (c < n) { 1106 delta++; 1107 } 1108 if (c == n) { 1109 ulong q = delta; 1110 auto k = base; 1111 while (true) { 1112 ulong t; 1113 if (k <= bias) { 1114 t = tmin; 1115 } else if (k >= bias + tmax) { 1116 t = tmax; 1117 } else { 1118 t = k - bias; 1119 } 1120 if (q < t) { 1121 break; 1122 } 1123 output ~= digitToBasic(t + ((q - t) % (base - t))); 1124 q = (q - t) / (base - t); 1125 k += base; 1126 } 1127 output ~= digitToBasic(q); 1128 pushed++; 1129 bias = adapt(delta, pushed, first); 1130 first = false; 1131 delta = 0; 1132 } 1133 } 1134 delta++; 1135 n++; 1136 } 1137 return cast(string)output.data; 1138 } 1139 1140 /** 1141 * Decode the input string using the Punycode algorithm. 1142 * 1143 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked 1144 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com" 1145 * in Punycode, you will get "xn--m3h.xn--n3h.com". 1146 * 1147 * In order to puny-decode a domain name, you must split it into its components. 
The following will
 * typically suffice:
 * ---
 * auto domain = "xn--m3h.xn--n3h.com";
 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
 * ---
 *
 * Params:
 *   input = a single domain name segment, Punycode-encoded or not.
 *
 * Returns: The decoded segment. If input does not start with the Punycode marker, it is
 *   returned unchanged.
 *
 * Throws: URLException if the encoded part ends in the middle of a number, contains a character
 *   that is not a Punycode digit, or describes a value that overflows.
 */
string punyDecode(string input) {
    if (!input.startsWith(marker)) {
        return input;
    }
    input = input[marker.length..$];

    // let n = initial_n
    dchar n = cast(dchar)128;

    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    ulong i = 0;
    auto bias = initialBias;
    dchar[] output;
    // This reserves a bit more than necessary, but it should be more efficient overall than just
    // appending and inserting one element at a time.
    output.reserve(input.length);

    // consume all code points before the last delimiter (if there is one)
    // and copy them to output
    // if a delimiter was found then consume one more code point
    // (which will be the last delimiter)
    auto end = input.lastIndexOf(delimiter);
    if (end > -1) {
        foreach (dchar c; input[0..end]) {
            output ~= c;
        }
        input = input[end+1 .. $];
    }

    // while the input is not exhausted do begin
    ulong pos = 0;
    while (pos < input.length) {
        // let oldi = i
        // let w = 1
        auto oldi = i;
        // w is kept as a ulong and range-checked below, so neither it nor i can silently wrap.
        ulong w = 1;
        // for k = base to infinity in steps of base do begin
        for (ulong k = base; k < uint.max; k += base) {
            // consume a code point, or fail if there was none to consume
            if (pos >= input.length) {
                throw new URLException("unexpected end of input during punydecode");
            }
            // Note that the input is all ASCII, so we can simply index the input string bytewise.
            auto c = input[pos];
            pos++;
            // let digit = the code point's digit-value, fail if it has none
            // (basicToDigit signals "no digit value" by returning base.)
            ulong digit = basicToDigit(c);
            if (digit >= base) {
                throw new URLException("invalid character in input during punydecode");
            }
            // let i = i + digit * w, fail on overflow
            // i never exceeds uint.max here, so the subtraction cannot underflow.
            if (digit > (uint.max - i) / w) {
                throw new URLException("overflow during punydecode");
            }
            i += digit * w;
            // let t = tmin if k <= bias {+ tmin}, or
            //         tmax if k >= bias + tmax, or k - bias otherwise
            ulong t;
            if (k <= bias) {
                t = tmin;
            } else if (k >= bias + tmax) {
                t = tmax;
            } else {
                t = k - bias;
            }
            // if digit < t then break
            if (digit < t) {
                break;
            }
            // let w = w * (base - t), fail on overflow
            if (w > uint.max / (base - t)) {
                throw new URLException("overflow during punydecode");
            }
            w *= (base - t);
            // end
        }
        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, output.length + 1, oldi == 0);
        // let n = n + i div (length(output) + 1), fail on overflow
        ulong step = i / (output.length + 1);
        if (step > dchar.max - n) {
            throw new URLException("code point out of range during punydecode");
        }
        n = cast(dchar)(n + step);
        // let i = i mod (length(output) + 1)
        i %= (output.length + 1);
        // {if n is a basic code point then fail}
        // (We aren't actually going to fail here; it's clear what this means.)
        // insert n into output at position i
        (() @trusted { output.insertInPlace(i, cast(dchar)n); })();  // should be @safe but isn't marked
        // increment i
        i++;
        // end
    }
    return output.to!string;
}

// Lifted from punycode.js.
// Maps a digit value to the code point that represents it: values below 26 are offset by 97
// (so 0 becomes 'a'), all other values are offset by 22 (so 26 becomes '0').
private dchar digitToBasic(ulong digit) {
    return cast(dchar)(digit < 26 ? digit + 97 : digit + 22);
}

// Lifted from punycode.js.
private uint basicToDigit(char c) {
    // Digits '0'..'9' carry the values 26..35.
    if (c >= '0' && c <= '9') {
        return cast(uint)(c - '0') + 26;
    }
    // Letters carry the values 0..25, regardless of case.
    if (c >= 'A' && c <= 'Z') {
        return cast(uint)(c - 'A');
    }
    if (c >= 'a' && c <= 'z') {
        return cast(uint)(c - 'a');
    }
    // Anything else has no digit value; base is returned in that case.
    return base;
}

// Encoding of individual labels.
unittest {
    void expectEncoded(string plain, string expected) {
        auto actual = punyEncode(plain);
        assert(actual == expected, actual);
    }

    expectEncoded("b\u00FCcher", "xn--bcher-kva");
    expectEncoded("b\u00FCc\u00FCher", "xn--bcher-kvab");
    expectEncoded("ýbücher", "xn--bcher-kvaf");
    expectEncoded("mañana", "xn--maana-pta");
    expectEncoded(
        "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
        "xn--egbpdaj6bu4bxfgehfvwxn");
}

// Decoding of individual labels.
unittest {
    void expectDecoded(string encoded, string expected) {
        auto actual = punyDecode(encoded);
        assert(actual == expected, actual);
    }

    expectDecoded("xn--egbpdaj6bu4bxfgehfvwxn", "ليهمابتكلموشعربي؟");
    expectDecoded("xn--maana-pta", "mañana");
}

// Whole domain names, processed one dot-separated label at a time.
unittest {
    import std.string, std.algorithm, std.array, std.range;

    auto punyDomain = "xn--m3h.xn--n3h.com";
    auto unicodeDomain = "☂.☃.com";

    auto decoded = punyDomain.splitter(".").map!(punyDecode).join(".");
    assert(decoded == "☂.☃.com", decoded);

    auto encoded = unicodeDomain.splitter(".").map!(punyEncode).join(".");
    assert(encoded == "xn--m3h.xn--n3h.com", encoded);
}