1 /** 2 * A URL handling library. 3 * 4 * URLs are Unique Resource Locators. They consist of a scheme and a host, with some optional 5 * elements like port, path, username, and password. 6 * 7 * This module aims to make it simple to muck about with them. 8 * 9 * Example usage: 10 * --- 11 * auto url = "ssh://me:password@192.168.0.8/".parseURL; 12 * auto files = system("ssh", url.toString, "ls").splitLines; 13 * foreach (file; files) { 14 * system("scp", url ~ file, "."); 15 * } 16 * --- 17 * 18 * License: The MIT license. 19 */ 20 module url; 21 22 import std.algorithm; 23 import std.array; 24 import std.conv; 25 import std.encoding; 26 import std.string; 27 import std.utf; 28 29 @safe: 30 31 /// An exception thrown when something bad happens with URLs. 32 class URLException : Exception { 33 this(string msg) { super(msg); } 34 } 35 36 /** 37 * A mapping from schemes to their default ports. 38 * 39 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to 40 * use even if they use ports. Entries here should be treated as best guesses. 41 */ 42 ushort[string] schemeToDefaultPort; 43 44 static this() { 45 schemeToDefaultPort = [ 46 "aaa": 3868, 47 "aaas": 5658, 48 "acap": 674, 49 "amqp": 5672, 50 "cap": 1026, 51 "coap": 5683, 52 "coaps": 5684, 53 "dav": 443, 54 "dict": 2628, 55 "ftp": 21, 56 "git": 9418, 57 "go": 1096, 58 "gopher": 70, 59 "http": 80, 60 "https": 443, 61 "iac": 4569, 62 "icap": 1344, 63 "imap": 143, 64 "ipp": 631, 65 "ipps": 631, // yes, they're both mapped to port 631 66 "irc": 6667, // De facto default port, not the IANA reserved port. 67 "ircs": 6697, 68 "iris": 702, // defaults to iris.beep 69 "iris.beep": 702, 70 "iris.lwz": 715, 71 "iris.xpc": 713, 72 "iris.xpcs": 714, 73 "jabber": 5222, // client-to-server 74 "ldap": 389, 75 "ldaps": 636, 76 "msrp": 2855, 77 "msrps": 2855, 78 "mtqp": 1038, 79 "mupdate": 3905, 80 "news": 119, 81 "nfs": 2049, 82 "pop": 110, 83 "redis": 6379, 84 "reload": 6084, 85 "rsync": 873, 86 "rtmfp": 1935, 87 "rtsp": 554, 88 "shttp": 80, 89 "sieve": 4190, 90 "sip": 5060, 91 "sips": 5061, 92 "smb": 445, 93 "smtp": 25, 94 "snews": 563, 95 "snmp": 161, 96 "soap.beep": 605, 97 "ssh": 22, 98 "stun": 3478, 99 "stuns": 5349, 100 "svn": 3690, 101 "teamspeak": 9987, 102 "telnet": 23, 103 "tftp": 69, 104 "tip": 3372, 105 ]; 106 } 107 108 /** 109 * A collection of query parameters. 110 * 111 * This is effectively a multimap of string -> strings. 112 */ 113 struct QueryParams { 114 import std.typecons; 115 alias Tuple!(string, "key", string, "value") Param; 116 Param[] params; 117 118 @property size_t length() { 119 return params.length; 120 } 121 122 /// Get a range over the query parameter values for the given key. 123 auto opIndex(string key) { 124 return params.find!(x => x.key == key).map!(x => x.value); 125 } 126 127 /// Add a query parameter with the given key and value. 128 /// If one already exists, there will now be two query parameters with the given name. 129 void add(string key, string value) { 130 params ~= Param(key, value); 131 } 132 133 /// Add a query parameter with the given key and value. 134 /// If there are any existing parameters with the same key, they are removed and overwritten. 135 void overwrite(string key, string value) { 136 for (int i = 0; i < params.length; i++) { 137 if (params[i].key == key) { 138 params[i] = params[$-1]; 139 params.length--; 140 } 141 } 142 params ~= Param(key, value); 143 } 144 145 private struct QueryParamRange { 146 size_t i; 147 const(Param)[] params; 148 bool empty() { return i >= params.length; } 149 void popFront() { i++; } 150 Param front() { return params[i]; } 151 } 152 153 /** 154 * A range over the query parameters. 155 * 156 * Usage: 157 * --- 158 * foreach (key, value; url.queryParams) {} 159 * --- 160 */ 161 auto range() { 162 return QueryParamRange(0, this.params); 163 } 164 /// ditto 165 alias range this; 166 } 167 168 /** 169 * A Unique Resource Locator. 170 * 171 * URLs can be parsed (see parseURL) and implicitly convert to strings. 172 */ 173 struct URL { 174 /// The URL scheme. For instance, ssh, ftp, or https. 175 string scheme; 176 177 /// The username in this URL. Usually absent. If present, there will also be a password. 178 string user; 179 180 /// The password in this URL. Usually absent. 181 string pass; 182 183 /// The hostname. 184 string host; 185 186 /** 187 * The port. 188 * 189 * This is inferred from the scheme if it isn't present in the URL itself. 190 * If the scheme is not known and the port is not present, the port will be given as 0. 191 * For some schemes, port will not be sensible -- for instance, file or chrome-extension. 192 * 193 * If you explicitly need to detect whether the user provided a port, check the providedPort 194 * field. 195 */ 196 @property ushort port() { 197 if (providedPort != 0) { 198 return providedPort; 199 } 200 if (auto p = scheme in schemeToDefaultPort) { 201 return *p; 202 } 203 return 0; 204 } 205 206 /** 207 * Set the port. 208 * 209 * This sets the providedPort field and is provided for convenience. 210 */ 211 @property ushort port(ushort value) { 212 return providedPort = value; 213 } 214 215 /// The port that was explicitly provided in the URL. 216 ushort providedPort; 217 218 /** 219 * The path. 220 * 221 * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is 222 * "/news/story/17774". 223 */ 224 string path; 225 226 /** 227 * Deprecated: this disallows multiple values for the same query string. Please use queryParams 228 * instead. 229 * 230 * The query string elements. 231 * 232 * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string 233 * elements will be ["visited": "false"]. 234 * 235 * Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be 236 * ["item": ""]. 237 * 238 * This field is mutable, so be cautious. 239 */ 240 deprecated("use queryParams") string[string] query; 241 242 /** 243 * The query parameters associated with this URL. 244 */ 245 QueryParams queryParams; 246 247 /** 248 * The fragment. In web documents, this typically refers to an anchor element. 249 * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2". 250 */ 251 string fragment; 252 253 /** 254 * Convert this URL to a string. 255 * The string is properly formatted and usable for, eg, a web request. 256 */ 257 string toString() { 258 return toString(false); 259 } 260 261 /** 262 * Convert this URL to a string. 263 * The string is intended to be human-readable rather than machine-readable. 264 */ 265 string toHumanReadableString() { 266 return toString(true); 267 } 268 269 private string toString(bool humanReadable) { 270 Appender!string s; 271 s ~= scheme; 272 s ~= "://"; 273 if (user) { 274 s ~= humanReadable ? user : user.percentEncode; 275 s ~= ":"; 276 s ~= humanReadable ? pass : pass.percentEncode; 277 s ~= "@"; 278 } 279 s ~= humanReadable ? host : host.toPuny; 280 if (providedPort) { 281 if ((scheme in schemeToDefaultPort) == null || schemeToDefaultPort[scheme] != providedPort) { 282 s ~= ":"; 283 s ~= providedPort.to!string; 284 } 285 } 286 string p = path; 287 if (p.length == 0 || p == "/") { 288 s ~= '/'; 289 } else { 290 if (p[0] == '/') { 291 p = p[1..$]; 292 } 293 if (humanReadable) { 294 s ~= p; 295 } else { 296 foreach (part; p.split('/')) { 297 s ~= '/'; 298 s ~= part.percentEncode; 299 } 300 } 301 } 302 if (queryParams.length) { 303 bool first = true; 304 s ~= '?'; 305 foreach (k, v; queryParams) { 306 if (!first) { 307 s ~= '&'; 308 } 309 first = false; 310 s ~= k.percentEncode; 311 if (v.length > 0) { 312 s ~= '='; 313 s ~= v.percentEncode; 314 } 315 } 316 } else if (query) { 317 s ~= '?'; 318 bool first = true; 319 foreach (k, v; query) { 320 if (!first) { 321 s ~= '&'; 322 } 323 first = false; 324 s ~= k.percentEncode; 325 if (v.length > 0) { 326 s ~= '='; 327 s ~= v.percentEncode; 328 } 329 } 330 } 331 if (fragment) { 332 s ~= '#'; 333 s ~= fragment.percentEncode; 334 } 335 return s.data; 336 } 337 338 /// Implicitly convert URLs to strings. 339 alias toString this; 340 341 /** 342 * The append operator (~). 343 * 344 * The append operator for URLs returns a new URL with the given string appended as a path 345 * element to the URL's path. It only adds new path elements (or sequences of path elements). 346 * 347 * Don't worry about path separators; whether you include them or not, it will just work. 348 * 349 * Query elements are copied. 350 * 351 * Examples: 352 * --- 353 * auto random = "http://testdata.org/random".parseURL; 354 * auto randInt = random ~ "int"; 355 * writeln(randInt); // prints "http://testdata.org/random/int" 356 * --- 357 */ 358 URL opBinary(string op : "~")(string subsequentPath) { 359 URL other = this; 360 other ~= subsequentPath; 361 if (query) { 362 other.query = other.query.dup; 363 } 364 return other; 365 } 366 367 /** 368 * The append-in-place operator (~=). 369 * 370 * The append operator for URLs adds a path element to this URL. It only adds new path elements 371 * (or sequences of path elements). 372 * 373 * Don't worry about path separators; whether you include them or not, it will just work. 374 * 375 * Examples: 376 * --- 377 * auto random = "http://testdata.org/random".parseURL; 378 * random ~= "int"; 379 * writeln(random); // prints "http://testdata.org/random/int" 380 * --- 381 */ 382 URL opOpAssign(string op : "~")(string subsequentPath) { 383 if (path.endsWith("/") || subsequentPath.startsWith("/")) { 384 if (path.endsWith("/") && subsequentPath.startsWith("/")) { 385 path ~= subsequentPath[1..$]; 386 } else { 387 path ~= subsequentPath; 388 } 389 } else { 390 path ~= '/'; 391 path ~= subsequentPath; 392 } 393 return this; 394 } 395 } 396 397 /** 398 * Parse a URL from a string. 399 * 400 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes 401 * may be made. However, any URL in a correct format will be parsed correctly. 402 */ 403 bool tryParseURL(string value, out URL url) { 404 url = URL.init; 405 // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment] 406 // Scheme is optional in common use. We infer 'http' if it's not given. 407 auto i = value.indexOf("://"); 408 if (i > -1) { 409 url.scheme = value[0..i]; 410 value = value[i+3 .. $]; 411 } else { 412 url.scheme = "http"; 413 } 414 // [user:password@]host[:port]][/]path[?query][#fragment 415 i = value.indexOfAny([':', '/']); 416 if (i == -1) { 417 // Just a hostname. 418 url.host = value.fromPuny; 419 return true; 420 } 421 422 if (value[i] == ':') { 423 // This could be between username and password, or it could be between host and port. 424 auto j = value.indexOfAny(['@', '/']); 425 if (j > -1 && value[j] == '@') { 426 try { 427 url.user = value[0..i].percentDecode; 428 url.pass = value[i+1 .. j].percentDecode; 429 } catch (URLException) { 430 return false; 431 } 432 value = value[j+1 .. $]; 433 } 434 } 435 436 // It's trying to be a host/port, not a user/pass. 437 i = value.indexOfAny([':', '/']); 438 if (i == -1) { 439 url.host = value.fromPuny; 440 return true; 441 } 442 url.host = value[0..i].fromPuny; 443 value = value[i .. $]; 444 if (value[0] == ':') { 445 auto end = value.indexOf('/'); 446 if (end == -1) { 447 end = value.length; 448 } 449 try { 450 url.port = value[1 .. end].to!ushort; 451 } catch (ConvException) { 452 return false; 453 } 454 value = value[end .. $]; 455 if (value.length == 0) { 456 return true; 457 } 458 } 459 460 i = value.indexOfAny("?#"); 461 if (i == -1) { 462 url.path = value.percentDecode; 463 return true; 464 } 465 466 try { 467 url.path = value[0..i].percentDecode; 468 } catch (URLException) { 469 return false; 470 } 471 auto c = value[i]; 472 value = value[i + 1 .. $]; 473 if (c == '?') { 474 i = value.indexOf('#'); 475 string query; 476 if (i < 0) { 477 query = value; 478 value = null; 479 } else { 480 query = value[0..i]; 481 value = value[i + 1 .. $]; 482 } 483 auto queries = query.split('&'); 484 foreach (q; queries) { 485 auto j = q.indexOf('='); 486 string key, val; 487 if (j < 0) { 488 key = q; 489 } else { 490 key = q[0..j]; 491 val = q[j + 1 .. $]; 492 } 493 try { 494 key = key.percentDecode; 495 val = val.percentDecode; 496 } catch (URLException) { 497 return false; 498 } 499 url.query[key] = val; 500 url.queryParams.add(key, val); 501 } 502 } 503 504 try { 505 url.fragment = value.percentDecode; 506 } catch (URLException) { 507 return false; 508 } 509 510 return true; 511 } 512 513 unittest { 514 { 515 // Basic. 516 URL url; 517 with (url) { 518 scheme = "https"; 519 host = "example.org"; 520 path = "/foo/bar"; 521 query["hello"] = "world"; 522 query["gibe"] = "clay"; 523 fragment = "frag"; 524 } 525 assert( 526 // Not sure what order it'll come out in. 527 url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" || 528 url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag", 529 url.toString); 530 } 531 { 532 // Percent encoded. 533 URL url; 534 with (url) { 535 scheme = "https"; 536 host = "example.org"; 537 path = "/f☃o"; 538 query["❄"] = "❀"; 539 query["["] = "]"; 540 fragment = "ş"; 541 } 542 assert( 543 // Not sure what order it'll come out in. 544 url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" || 545 url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F", 546 url.toString); 547 } 548 { 549 // Port, user, pass. 550 URL url; 551 with (url) { 552 scheme = "https"; 553 host = "example.org"; 554 user = "dhasenan"; 555 pass = "itsasecret"; 556 port = 17; 557 } 558 assert( 559 url.toString == "https://dhasenan:itsasecret@example.org:17/", 560 url.toString); 561 } 562 { 563 // Query with no path. 564 URL url; 565 with (url) { 566 scheme = "https"; 567 host = "example.org"; 568 query["hi"] = "bye"; 569 } 570 assert( 571 url.toString == "https://example.org/?hi=bye", 572 url.toString); 573 } 574 } 575 576 /// 577 unittest { 578 { 579 // Basic. 580 URL url; 581 with (url) { 582 scheme = "https"; 583 host = "example.org"; 584 path = "/foo/bar"; 585 queryParams.add("hello", "world"); 586 queryParams.add("gibe", "clay"); 587 fragment = "frag"; 588 } 589 assert( 590 // Not sure what order it'll come out in. 591 url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" || 592 url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag", 593 url.toString); 594 } 595 { 596 // Passing an array of query values. 597 URL url; 598 with (url) { 599 scheme = "https"; 600 host = "example.org"; 601 path = "/foo/bar"; 602 queryParams.add("hello", "world"); 603 queryParams.add("hello", "aether"); 604 fragment = "frag"; 605 } 606 assert( 607 // Not sure what order it'll come out in. 608 url.toString == "https://example.org/foo/bar?hello=world&hello=aether#frag" || 609 url.toString == "https://example.org/foo/bar?hello=aether&hello=world#frag", 610 url.toString); 611 } 612 { 613 // Percent encoded. 614 URL url; 615 with (url) { 616 scheme = "https"; 617 host = "example.org"; 618 path = "/f☃o"; 619 queryParams.add("❄", "❀"); 620 queryParams.add("[", "]"); 621 fragment = "ş"; 622 } 623 assert( 624 // Not sure what order it'll come out in. 625 url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" || 626 url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F", 627 url.toString); 628 } 629 { 630 // Port, user, pass. 631 URL url; 632 with (url) { 633 scheme = "https"; 634 host = "example.org"; 635 user = "dhasenan"; 636 pass = "itsasecret"; 637 port = 17; 638 } 639 assert( 640 url.toString == "https://dhasenan:itsasecret@example.org:17/", 641 url.toString); 642 } 643 { 644 // Query with no path. 645 URL url; 646 with (url) { 647 scheme = "https"; 648 host = "example.org"; 649 queryParams.add("hi", "bye"); 650 } 651 assert( 652 url.toString == "https://example.org/?hi=bye", 653 url.toString); 654 } 655 } 656 657 unittest { 658 // Percent decoding. 659 660 // http://#:!:@ 661 auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash"; 662 auto url = urlString.parseURL; 663 assert(url.user == "#"); 664 assert(url.pass == "!:"); 665 assert(url.host == "example.org"); 666 assert(url.path == "/{/}"); 667 assert(url.queryParams[";"].front == ""); 668 assert(url.queryParams["&"].front == "="); 669 assert(url.fragment == "#hash"); 670 671 // Round trip. 672 assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString); 673 assert(urlString == urlString.parseURL.toString.parseURL.toString); 674 } 675 676 unittest { 677 auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL; 678 assert(url.host == "☂.☃.org", url.host); 679 } 680 681 unittest { 682 auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL; 683 assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString); 684 assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString); 685 } 686 687 unittest { 688 auto url = "https://☂.☃.org/?hi=bye".parseURL; 689 assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye"); 690 } 691 692 /// 693 unittest { 694 // There's an existing path. 695 auto url = parseURL("http://example.org/foo"); 696 // No slash? Assume it needs a slash. 697 assert((url ~ "bar").toString == "http://example.org/foo/bar"); 698 // With slash? Don't add another. 699 assert((url ~ "/bar").toString == "http://example.org/foo/bar"); 700 url ~= "bar"; 701 assert(url.toString == "http://example.org/foo/bar"); 702 703 // Path already ends with a slash; don't add another. 704 url = parseURL("http://example.org/foo/"); 705 assert((url ~ "bar").toString == "http://example.org/foo/bar"); 706 // Still don't add one even if you're appending with a slash. 707 assert((url ~ "/bar").toString == "http://example.org/foo/bar"); 708 url ~= "/bar"; 709 assert(url.toString == "http://example.org/foo/bar"); 710 711 // No path. 712 url = parseURL("http://example.org"); 713 assert((url ~ "bar").toString == "http://example.org/bar"); 714 assert((url ~ "/bar").toString == "http://example.org/bar"); 715 url ~= "bar"; 716 assert(url.toString == "http://example.org/bar"); 717 718 // Path is just a slash. 719 url = parseURL("http://example.org/"); 720 assert((url ~ "bar").toString == "http://example.org/bar"); 721 assert((url ~ "/bar").toString == "http://example.org/bar"); 722 url ~= "bar"; 723 assert(url.toString == "http://example.org/bar", url.toString); 724 725 // No path, just fragment. 726 url = "ircs://irc.freenode.com/#d".parseURL; 727 assert(url.toString == "ircs://irc.freenode.com/#d", url.toString); 728 } 729 730 unittest { 731 import std.net.curl; 732 auto url = "http://example.org".parseURL; 733 assert(is(typeof(std.net.curl.get(url)))); 734 } 735 736 /** 737 * Parse the input string as a URL. 738 * 739 * Throws: 740 * URLException if the string was in an incorrect format. 741 */ 742 URL parseURL(string value) { 743 URL url; 744 if (tryParseURL(value, url)) { 745 return url; 746 } 747 throw new URLException("failed to parse URL " ~ value); 748 } 749 750 /// 751 unittest { 752 { 753 // Infer scheme 754 auto u1 = parseURL("example.org"); 755 assert(u1.scheme == "http"); 756 assert(u1.host == "example.org"); 757 assert(u1.path == ""); 758 assert(u1.port == 80); 759 assert(u1.providedPort == 0); 760 assert(u1.fragment == ""); 761 } 762 { 763 // Simple host and scheme 764 auto u1 = parseURL("https://example.org"); 765 assert(u1.scheme == "https"); 766 assert(u1.host == "example.org"); 767 assert(u1.path == ""); 768 assert(u1.port == 443); 769 assert(u1.providedPort == 0); 770 } 771 { 772 // With path 773 auto u1 = parseURL("https://example.org/foo/bar"); 774 assert(u1.scheme == "https"); 775 assert(u1.host == "example.org"); 776 assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path); 777 assert(u1.port == 443); 778 assert(u1.providedPort == 0); 779 } 780 { 781 // With explicit port 782 auto u1 = parseURL("https://example.org:1021/foo/bar"); 783 assert(u1.scheme == "https"); 784 assert(u1.host == "example.org"); 785 assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path); 786 assert(u1.port == 1021); 787 assert(u1.providedPort == 1021); 788 } 789 { 790 // With user 791 auto u1 = parseURL("https://bob:secret@example.org/foo/bar"); 792 assert(u1.scheme == "https"); 793 assert(u1.host == "example.org"); 794 assert(u1.path == "/foo/bar"); 795 assert(u1.port == 443); 796 assert(u1.user == "bob"); 797 assert(u1.pass == "secret"); 798 } 799 { 800 // With user, URL-encoded 801 auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar"); 802 assert(u1.scheme == "https"); 803 assert(u1.host == "example.org"); 804 assert(u1.path == "/foo/bar"); 805 assert(u1.port == 443); 806 assert(u1.user == "bob!"); 807 assert(u1.pass == "secret!?"); 808 } 809 { 810 // With user and port and path 811 auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar"); 812 assert(u1.scheme == "https"); 813 assert(u1.host == "example.org"); 814 assert(u1.path == "/foo/bar"); 815 assert(u1.port == 2210); 816 assert(u1.user == "bob"); 817 assert(u1.pass == "secret"); 818 assert(u1.fragment == ""); 819 } 820 { 821 // With query string 822 auto u1 = parseURL("https://example.org/?login=true"); 823 assert(u1.scheme == "https"); 824 assert(u1.host == "example.org"); 825 assert(u1.path == "/", "expected path: / actual path: " ~ u1.path); 826 assert(u1.queryParams["login"].front == "true"); 827 assert(u1.fragment == ""); 828 } 829 { 830 // With query string and fragment 831 auto u1 = parseURL("https://example.org/?login=true#justkidding"); 832 assert(u1.scheme == "https"); 833 assert(u1.host == "example.org"); 834 assert(u1.path == "/", "expected path: / actual path: " ~ u1.path); 835 assert(u1.queryParams["login"].front == "true"); 836 assert(u1.fragment == "justkidding"); 837 } 838 { 839 // With URL-encoded values 840 auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E"); 841 assert(u1.scheme == "https"); 842 assert(u1.host == "example.org"); 843 assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path); 844 assert(u1.queryParams["❄"].front == "="); 845 assert(u1.fragment == "^"); 846 } 847 } 848 849 unittest { 850 assert(parseURL("http://example.org").port == 80); 851 assert(parseURL("http://example.org:5326").port == 5326); 852 853 auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment"); 854 assert(url.scheme == "redis"); 855 assert(url.user == "admin"); 856 assert(url.pass == "password"); 857 858 assert(parseURL("example.org").toString == "http://example.org/"); 859 assert(parseURL("http://example.org:80").toString == "http://example.org/"); 860 861 assert(parseURL("localhost:8070").toString == "http://localhost:8070/"); 862 } 863 864 /** 865 * Percent-encode a string. 866 * 867 * URL components cannot contain non-ASCII characters, and there are very few characters that are 868 * safe to include as URL components. Domain names using Unicode values use Punycode. For 869 * everything else, there is percent encoding. 870 */ 871 string percentEncode(string raw) { 872 // We *must* encode these characters: :/?#[]@!$&'()*+,;=" 873 // We *can* encode any other characters. 874 // We *should not* encode alpha, numeric, or -._~. 875 Appender!string app; 876 foreach (dchar d; raw) { 877 if (('a' <= d && 'z' >= d) || 878 ('A' <= d && 'Z' >= d) || 879 ('0' <= d && '9' >= d) || 880 d == '-' || d == '.' || d == '_' || d == '~') { 881 app ~= d; 882 continue; 883 } 884 // Something simple like a space character? Still in 7-bit ASCII? 885 // Then we get a single-character string out of it and just encode 886 // that one bit. 887 // Something not in 7-bit ASCII? Then we percent-encode each octet 888 // in the UTF-8 encoding (and hope the server understands UTF-8). 889 char[] c; 890 encode(c, d); 891 auto bytes = cast(ubyte[])c; 892 foreach (b; bytes) { 893 app ~= format("%%%02X", b); 894 } 895 } 896 return cast(string)app.data; 897 } 898 899 /// 900 unittest { 901 assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding"); 902 assert(percentEncode("~~--..__") == "~~--..__"); 903 assert(percentEncode("0123456789") == "0123456789"); 904 905 string e; 906 907 e = percentEncode("☃"); 908 assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e); 909 } 910 911 /** 912 * Percent-decode a string. 913 * 914 * URL components cannot contain non-ASCII characters, and there are very few characters that are 915 * safe to include as URL components. Domain names using Unicode values use Punycode. For 916 * everything else, there is percent encoding. 917 * 918 * This explicitly ensures that the result is a valid UTF-8 string. 919 */ 920 @trusted string percentDecode(string encoded) { 921 ubyte[] raw = percentDecodeRaw(encoded); 922 // This cast is not considered @safe because it converts from one pointer type to another. 923 // However, it's 1-byte values in either case, no reference types, so this won't result in any 924 // memory safety errors. We also check for validity immediately. 925 auto s = cast(string) raw; 926 if (!s.isValid) { 927 // TODO(dhasenan): 928 throw new URLException("input contains invalid UTF data"); 929 } 930 return s; 931 } 932 933 /// 934 unittest { 935 assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding"); 936 assert(percentDecode("~~--..__") == "~~--..__"); 937 assert(percentDecode("0123456789") == "0123456789"); 938 939 string e; 940 941 e = percentDecode("%E2%98%83"); 942 assert(e == "☃", "expected a snowman but got" ~ e); 943 } 944 945 /** 946 * Percent-decode a string into a ubyte array. 947 * 948 * URL components cannot contain non-ASCII characters, and there are very few characters that are 949 * safe to include as URL components. Domain names using Unicode values use Punycode. For 950 * everything else, there is percent encoding. 951 * 952 * This yields a ubyte array and will not perform validation on the output. However, an improperly 953 * formatted input string will result in a URLException. 954 */ 955 ubyte[] percentDecodeRaw(string encoded) { 956 // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now. 957 Appender!(ubyte[]) app; 958 for (int i = 0; i < encoded.length; i++) { 959 if (encoded[i] != '%') { 960 app ~= encoded[i]; 961 continue; 962 } 963 if (i >= encoded.length - 2) { 964 throw new URLException("Invalid percent encoded value: expected two characters after " ~ 965 "percent symbol. Error at index " ~ i.to!string); 966 } 967 auto b = cast(ubyte)("0123456789ABCDEF".indexOf(encoded[i + 1])); 968 auto c = cast(ubyte)("0123456789ABCDEF".indexOf(encoded[i + 2])); 969 app ~= cast(ubyte)((b << 4) | c); 970 i += 2; 971 } 972 return app.data; 973 } 974 975 private string toPuny(string unicodeHostname) { 976 bool mustEncode = false; 977 foreach (i, dchar d; unicodeHostname) { 978 auto c = cast(uint) d; 979 if (c > 0x80) { 980 mustEncode = true; 981 break; 982 } 983 if (c < 0x2C || (c >= 0x3A && c <= 40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) { 984 throw new URLException( 985 format( 986 "domain name '%s' contains illegal character '%s' at position %s", 987 unicodeHostname, d, i)); 988 } 989 } 990 if (!mustEncode) { 991 return unicodeHostname; 992 } 993 return unicodeHostname.split('.').map!punyEncode.join("."); 994 } 995 996 private string fromPuny(string hostname) { 997 return hostname.split('.').map!punyDecode.join("."); 998 } 999 1000 private { 1001 enum delimiter = '-'; 1002 enum marker = "xn--"; 1003 enum ulong damp = 700; 1004 enum ulong tmin = 1; 1005 enum ulong tmax = 26; 1006 enum ulong skew = 38; 1007 enum ulong base = 36; 1008 enum ulong initialBias = 72; 1009 enum dchar initialN = cast(dchar)128; 1010 1011 ulong adapt(ulong delta, ulong numPoints, bool firstTime) { 1012 if (firstTime) { 1013 delta /= damp; 1014 } else { 1015 delta /= 2; 1016 } 1017 delta += delta / numPoints; 1018 ulong k = 0; 1019 while (delta > ((base - tmin) * tmax) / 2) { 1020 delta /= (base - tmin); 1021 k += base; 1022 } 1023 return k + (((base - tmin + 1) * delta) / (delta + skew)); 1024 } 1025 } 1026 1027 /** 1028 * Encode the input string using the Punycode algorithm. 1029 * 1030 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked 1031 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com" 1032 * in Punycode, you will get "xn--m3h.xn--n3h.com". 1033 * 1034 * In order to puny-encode a domain name, you must split it into its components. The following will 1035 * typically suffice: 1036 * --- 1037 * auto domain = "☂.☃.com"; 1038 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join("."); 1039 * --- 1040 */ 1041 string punyEncode(string input) { 1042 ulong delta = 0; 1043 dchar n = initialN; 1044 auto i = 0; 1045 auto bias = initialBias; 1046 Appender!string output; 1047 output ~= marker; 1048 auto pushed = 0; 1049 auto codePoints = 0; 1050 foreach (dchar c; input) { 1051 codePoints++; 1052 if (c <= initialN) { 1053 output ~= c; 1054 pushed++; 1055 } 1056 } 1057 if (pushed < codePoints) { 1058 if (pushed > 0) { 1059 output ~= delimiter; 1060 } 1061 } else { 1062 // No encoding to do. 1063 return input; 1064 } 1065 bool first = true; 1066 while (pushed < codePoints) { 1067 auto best = dchar.max; 1068 foreach (dchar c; input) { 1069 if (n <= c && c < best) { 1070 best = c; 1071 } 1072 } 1073 if (best == dchar.max) { 1074 throw new URLException("failed to find a new codepoint to process during punyencode"); 1075 } 1076 delta += (best - n) * (pushed + 1); 1077 if (delta > uint.max) { 1078 // TODO better error message 1079 throw new URLException("overflow during punyencode"); 1080 } 1081 n = best; 1082 foreach (dchar c; input) { 1083 if (c < n) { 1084 delta++; 1085 } 1086 if (c == n) { 1087 ulong q = delta; 1088 auto k = base; 1089 while (true) { 1090 ulong t; 1091 if (k <= bias) { 1092 t = tmin; 1093 } else if (k >= bias + tmax) { 1094 t = tmax; 1095 } else { 1096 t = k - bias; 1097 } 1098 if (q < t) { 1099 break; 1100 } 1101 output ~= digitToBasic(t + ((q - t) % (base - t))); 1102 q = (q - t) / (base - t); 1103 k += base; 1104 } 1105 output ~= digitToBasic(q); 1106 pushed++; 1107 bias = adapt(delta, pushed, first); 1108 first = false; 1109 delta = 0; 1110 } 1111 } 1112 delta++; 1113 n++; 1114 } 1115 return cast(string)output.data; 1116 } 1117 1118 /** 1119 * Decode the input string using the Punycode algorithm. 1120 * 1121 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked 1122 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com" 1123 * in Punycode, you will get "xn--m3h.xn--n3h.com". 1124 * 1125 * In order to puny-decode a domain name, you must split it into its components. The following will 1126 * typically suffice: 1127 * --- 1128 * auto domain = "xn--m3h.xn--n3h.com"; 1129 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join("."); 1130 * --- 1131 */ 1132 string punyDecode(string input) { 1133 if (!input.startsWith(marker)) { 1134 return input; 1135 } 1136 input = input[marker.length..$]; 1137 1138 // let n = initial_n 1139 dchar n = cast(dchar)128; 1140 1141 // let i = 0 1142 // let bias = initial_bias 1143 // let output = an empty string indexed from 0 1144 ulong i = 0; 1145 auto bias = initialBias; 1146 dchar[] output; 1147 // This reserves a bit more than necessary, but it should be more efficient overall than just 1148 // appending and inserting volo-nolo. 1149 output.reserve(input.length); 1150 1151 // consume all code points before the last delimiter (if there is one) 1152 // and copy them to output, fail on any non-basic code point 1153 // if more than zero code points were consumed then consume one more 1154 // (which will be the last delimiter) 1155 auto end = input.lastIndexOf(delimiter); 1156 if (end > -1) { 1157 foreach (dchar c; input[0..end]) { 1158 output ~= c; 1159 } 1160 input = input[end+1 .. $]; 1161 } 1162 1163 // while the input is not exhausted do begin 1164 ulong pos = 0; 1165 while (pos < input.length) { 1166 // let oldi = i 1167 // let w = 1 1168 auto oldi = i; 1169 auto w = 1; 1170 // for k = base to infinity in steps of base do begin 1171 for (ulong k = base; k < uint.max; k += base) { 1172 // consume a code point, or fail if there was none to consume 1173 // Note that the input is all ASCII, so we can simply index the input string bytewise. 1174 auto c = input[pos]; 1175 pos++; 1176 // let digit = the code point's digit-value, fail if it has none 1177 auto digit = basicToDigit(c); 1178 // let i = i + digit * w, fail on overflow 1179 i += digit * w; 1180 // let t = tmin if k <= bias {+ tmin}, or 1181 // tmax if k >= bias + tmax, or k - bias otherwise 1182 ulong t; 1183 if (k <= bias) { 1184 t = tmin; 1185 } else if (k >= bias + tmax) { 1186 t = tmax; 1187 } else { 1188 t = k - bias; 1189 } 1190 // if digit < t then break 1191 if (digit < t) { 1192 break; 1193 } 1194 // let w = w * (base - t), fail on overflow 1195 w *= (base - t); 1196 // end 1197 } 1198 // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 1199 bias = adapt(i - oldi, output.length + 1, oldi == 0); 1200 // let n = n + i div (length(output) + 1), fail on overflow 1201 n += i / (output.length + 1); 1202 // let i = i mod (length(output) + 1) 1203 i %= (output.length + 1); 1204 // {if n is a basic code point then fail} 1205 // (We aren't actually going to fail here; it's clear what this means.) 1206 // insert n into output at position i 1207 (() @trusted { output.insertInPlace(i, cast(dchar)n); })(); // should be @safe but isn't marked 1208 // increment i 1209 i++; 1210 // end 1211 } 1212 return output.to!string; 1213 } 1214 1215 // Lifted from punycode.js. 1216 private dchar digitToBasic(ulong digit) { 1217 return cast(dchar)(digit + 22 + 75 * (digit < 26)); 1218 } 1219 1220 // Lifted from punycode.js. 1221 private uint basicToDigit(char c) { 1222 auto codePoint = cast(uint)c; 1223 if (codePoint - 48 < 10) { 1224 return codePoint - 22; 1225 } 1226 if (codePoint - 65 < 26) { 1227 return codePoint - 65; 1228 } 1229 if (codePoint - 97 < 26) { 1230 return codePoint - 97; 1231 } 1232 return base; 1233 } 1234 1235 unittest { 1236 { 1237 auto a = "b\u00FCcher"; 1238 assert(punyEncode(a) == "xn--bcher-kva"); 1239 } 1240 { 1241 auto a = "b\u00FCc\u00FCher"; 1242 assert(punyEncode(a) == "xn--bcher-kvab"); 1243 } 1244 { 1245 auto a = "ýbücher"; 1246 auto b = punyEncode(a); 1247 assert(b == "xn--bcher-kvaf", b); 1248 } 1249 1250 { 1251 auto a = "mañana"; 1252 assert(punyEncode(a) == "xn--maana-pta"); 1253 } 1254 1255 { 1256 auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" 1257 ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F"; 1258 auto b = punyEncode(a); 1259 assert(b == "xn--egbpdaj6bu4bxfgehfvwxn", b); 1260 } 1261 import std.stdio; 1262 } 1263 1264 unittest { 1265 { 1266 auto b = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn"); 1267 assert(b == "ليهمابتكلموشعربي؟", b); 1268 } 1269 { 1270 assert(punyDecode("xn--maana-pta") == "mañana"); 1271 } 1272 } 1273 1274 unittest { 1275 import std.string, std.algorithm, std.array, std.range; 1276 { 1277 auto domain = "xn--m3h.xn--n3h.com"; 1278 auto decodedDomain = domain.splitter(".").map!(punyDecode).join("."); 1279 assert(decodedDomain == "☂.☃.com", decodedDomain); 1280 } 1281 { 1282 auto domain = "☂.☃.com"; 1283 auto decodedDomain = domain.splitter(".").map!(punyEncode).join("."); 1284 assert(decodedDomain == "xn--m3h.xn--n3h.com", decodedDomain); 1285 } 1286 }