/**
 * A URL handling library.
 *
 * URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
 * elements like port, path, username, and password.
 *
 * This module aims to make it simple to muck about with them.
 *
 * Example usage:
 * ---
 * auto url = "ssh://me:password@192.168.0.8/".parseURL;
 * auto files = system("ssh", url.toString, "ls").splitLines;
 * foreach (file; files) {
 *     system("scp", url ~ file, ".");
 * }
 * ---
 *
 * License: The MIT license.
 */
module url;

import std.conv;
import std.string;

pure:
@safe:

/// An exception thrown when something bad happens with URLs.
class URLException : Exception
{
    this(string msg) pure { super(msg); }
}

/**
 * A mapping from schemes to their default ports.
 *
 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
 * use even if they use ports. Entries here should be treated as best guesses.
 */
enum ushort[string] schemeToDefaultPort = [
    "aaa": 3868,
    "aaas": 5658,
    "acap": 674,
    "amqp": 5672,
    "cap": 1026,
    "coap": 5683,
    "coaps": 5684,
    "dav": 443,
    "dict": 2628,
    "ftp": 21,
    "git": 9418,
    "go": 1096,
    "gopher": 70,
    "http": 80,
    "https": 443,
    "ws": 80,
    "wss": 443,
    "iac": 4569,
    "icap": 1344,
    "imap": 143,
    "ipp": 631,
    "ipps": 631,  // yes, they're both mapped to port 631
    "irc": 6667,  // De facto default port, not the IANA reserved port.
    "ircs": 6697,
    "iris": 702,  // defaults to iris.beep
    "iris.beep": 702,
    "iris.lwz": 715,
    "iris.xpc": 713,
    "iris.xpcs": 714,
    "jabber": 5222,  // client-to-server
    "ldap": 389,
    "ldaps": 636,
    "msrp": 2855,
    "msrps": 2855,
    "mtqp": 1038,
    "mupdate": 3905,
    "news": 119,
    "nfs": 2049,
    "pop": 110,
    "redis": 6379,
    "reload": 6084,
    "rsync": 873,
    "rtmfp": 1935,
    "rtsp": 554,
    "shttp": 80,
    "sieve": 4190,
    "sip": 5060,
    "sips": 5061,
    "smb": 445,
    "smtp": 25,
    "snews": 563,
    "snmp": 161,
    "soap.beep": 605,
    "ssh": 22,
    "stun": 3478,
    "stuns": 5349,
    "svn": 3690,
    "teamspeak": 9987,
    "telnet": 23,
    "tftp": 69,
    "tip": 3372,
];

/**
 * A collection of query parameters.
 *
 * This is effectively a multimap of string -> strings. Parameters are kept in an array, so
 * they are iterated in the order in which they were added.
 */
struct QueryParams
{
pure:
    import std.typecons;
    alias Tuple!(string, "key", string, "value") Param;
    Param[] params;

    /// The number of key/value pairs stored (duplicate keys are counted separately).
    @property size_t length() {
        return params.length;
    }

    /**
     * Get a range over the query parameter values for the given key.
     *
     * Only values whose key equals `key` are produced, in insertion order. The range is empty
     * if the key is absent.
     */
    auto opIndex(string key)
    {
        // `filter`, not `find`: `find` would merely skip ahead to the first match and then
        // yield the values of every later parameter, whatever its key.
        import std.algorithm.iteration : filter, map;
        return params.filter!(x => x.key == key).map!(x => x.value);
    }

    /// Add a query parameter with the given key and value.
    /// If one already exists, there will now be two query parameters with the given name.
    void add(string key, string value) {
        params ~= Param(key, value);
    }

    /// Add a query parameter with the given key and value.
    /// If there are any existing parameters with the same key, they are removed and overwritten.
void overwrite(string key, string value) {
        // Compact every parameter with a different key toward the front, keeping their
        // relative order. A swap-with-last removal would skip the element swapped into the
        // current slot and could leave stale entries for `key` behind.
        size_t kept = 0;
        foreach (p; params) {
            if (p.key != key) {
                params[kept++] = p;
            }
        }
        params = params[0 .. kept];
        params ~= Param(key, value);
    }

    /// Forward range over a snapshot of the parameter array.
    private struct QueryParamRange
    {
pure:
        size_t i;
        const(Param)[] params;
        bool empty() { return i >= params.length; }
        void popFront() { i++; }
        Param front() { return params[i]; }
    }

    /**
     * A range over the query parameters.
     *
     * Usage:
     * ---
     * foreach (key, value; url.queryParams) {}
     * ---
     */
    auto range() {
        return QueryParamRange(0, this.params);
    }
    /// ditto
    alias range this;

    /**
     * Convert this set of query parameters into a query string.
     *
     * Pairs are joined with '&'; keys and values are percent-encoded. A parameter with an
     * empty value is written as the bare key, without '='.
     */
    string toString() {
        import std.array : Appender;
        Appender!string s;
        bool first = true;
        foreach (tuple; this) {
            if (!first) {
                s ~= '&';
            }
            first = false;
            s ~= tuple.key.percentEncode;
            if (tuple.value.length > 0) {
                s ~= '=';
                s ~= tuple.value.percentEncode;
            }
        }
        return s.data;
    }

    /// Clone this set of query parameters (the parameter array is duplicated, not shared).
    QueryParams dup() {
        QueryParams other = this;
        other.params = params.dup;
        return other;
    }
}

/**
 * A Uniform Resource Locator.
 *
 * URLs can be parsed (see parseURL) and implicitly convert to strings.
 */
struct URL
{
pure:
    /// The URL scheme. For instance, ssh, ftp, or https.
    string scheme;

    /// The username in this URL. Usually absent. If present, there will also be a password.
    string user;

    /// The password in this URL. Usually absent.
    string pass;

    /// The hostname.
    string host;

    /**
     * The port.
     *
     * This is inferred from the scheme if it isn't present in the URL itself.
     * If the scheme is not known and the port is not present, the port will be given as 0.
* For some schemes, port will not be sensible -- for instance, file or chrome-extension.
     *
     * If you explicitly need to detect whether the user provided a port, check the providedPort
     * field.
     */
    @property ushort port() {
        if (providedPort == 0) {
            // Nothing explicit was given: fall back to the table of well-known defaults.
            const known = scheme in schemeToDefaultPort;
            if (known is null) {
                return 0;
            }
            return *known;
        }
        return providedPort;
    }

    /**
     * Set the port.
     *
     * Convenience setter: stores the value in providedPort and returns it.
     */
    @property ushort port(ushort value) {
        providedPort = value;
        return providedPort;
    }

    /// The port that was explicitly provided in the URL (0 when none was given).
    ushort providedPort;

    /**
     * The path.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
     * "/news/story/17774".
     */
    string path;

    /**
     * The query parameters associated with this URL.
     */
    QueryParams queryParams;

    /**
     * The fragment. In web documents, this typically refers to an anchor element.
     * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
     */
    string fragment;

    /**
     * Convert this URL to a string.
     * The string is properly formatted and usable for, eg, a web request.
     */
    string toString()
    {
        enum bool forHumans = false;
        return toString(forHumans);
    }

    /**
     * Convert this URL to a string.
     *
     * The string is intended to be human-readable rather than machine-readable.
*/
    string toHumanReadableString()
    {
        return toString(true);
    }

    ///
    unittest
    {
        auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
        assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
        assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toHumanReadableString);

        // The path separator must survive in the human-readable form too.
        auto withPath = "https://xn--m3h.xn--n3h.org/foo/bar".parseURL;
        assert(withPath.toHumanReadableString == "https://☂.☃.org/foo/bar",
                withPath.toHumanReadableString);
    }

    /**
     * Shared implementation of toString and toHumanReadableString.
     *
     * Params:
     *      humanReadable = when true, user, password, host and path are emitted as stored;
     *                      when false, they are percent-encoded (host: Punycode-encoded).
     *
     * The query string and fragment are percent-encoded in both modes. An explicitly provided
     * port is omitted when it equals the scheme's default port.
     */
    private string toString(bool humanReadable) {
        import std.array : Appender;
        Appender!string s;
        s ~= scheme;
        s ~= "://";
        if (user) {
            s ~= humanReadable ? user : user.percentEncode;
            s ~= ":";
            s ~= humanReadable ? pass : pass.percentEncode;
            s ~= "@";
        }
        s ~= humanReadable ? host : host.toPuny;
        if (providedPort) {
            // Single lookup; print the port only when it is not the scheme's default.
            auto defaultPort = scheme in schemeToDefaultPort;
            if (defaultPort is null || *defaultPort != providedPort) {
                s ~= ":";
                s ~= providedPort.to!string;
            }
        }
        string p = path;
        if (p.length == 0 || p == "/") {
            s ~= '/';
        } else {
            if (p[0] == '/') {
                p = p[1..$];
            }
            if (humanReadable) {
                // The leading slash was stripped above; put it back so host and path do not
                // run together.
                s ~= '/';
                s ~= p;
            } else {
                // Encode each segment separately so the '/' separators stay literal.
                foreach (part; p.split('/')) {
                    s ~= '/';
                    s ~= part.percentEncode;
                }
            }
        }
        if (queryParams.length) {
            s ~= '?';
            s ~= queryParams.toString;
        }
        if (fragment) {
            s ~= '#';
            s ~= fragment.percentEncode;
        }
        return s.data;
    }

    /// Implicitly convert URLs to strings.
    alias toString this;

    /**
     * The append operator (~).
     *
     * The append operator for URLs returns a new URL with the given string appended as a path
     * element to the URL's path. It only adds new path elements (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Query elements are copied.
*
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * auto randInt = random ~ "int";
     * writeln(randInt);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opBinary(string op : "~")(string subsequentPath) {
        URL extended = this;
        // Give the copy its own parameter array so the two URLs do not share storage.
        extended.queryParams = queryParams.dup;
        extended ~= subsequentPath;
        return extended;
    }

    /**
     * The append-in-place operator (~=).
     *
     * Adds a path element (or a sequence of path elements) to this URL's path and returns the
     * updated URL.
     *
     * Exactly one '/' ends up between the existing path and the appended text, whether or not
     * either side already supplies one.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * random ~= "int";
     * writeln(random);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opOpAssign(string op : "~")(string subsequentPath) {
        const bool pathEndsWithSlash = path.endsWith("/");
        const bool suffixStartsWithSlash = subsequentPath.startsWith("/");

        if (pathEndsWithSlash && suffixStartsWithSlash) {
            // Both sides bring a separator: drop the duplicate.
            path ~= subsequentPath[1..$];
        } else if (pathEndsWithSlash || suffixStartsWithSlash) {
            // Exactly one separator is already present.
            path ~= subsequentPath;
        } else {
            // Neither side has one: supply it.
            path ~= '/';
            path ~= subsequentPath;
        }
        return this;
    }

    /**
     * Convert a relative URL to an absolute URL.
     *
     * This is designed so that you can scrape a webpage and quickly convert links within the
     * page to URLs you can actually work with, but you're clever; I'm sure you'll find more uses
     * for it.
     *
     * It's biased toward HTTP family URLs; as one quirk, "//" is interpreted as "same scheme,
     * different everything else", which might not be desirable for all schemes.
     *
     * This only handles URLs, not URIs; if you pass in 'mailto:bob.dobbs@subgenius.org', for
     * instance, this will give you our best attempt to parse it as a URL.
*
     * Examples:
     * ---
     * auto base = "https://example.org/passworddb/?secure=false".parseURL;
     *
     * // Download https://example.org/passworddb/by-username/dhasenan
     * download(base.resolve("by-username/dhasenan"));
     *
     * // Download https://example.org/static/style.css
     * download(base.resolve("/static/style.css"));
     *
     * // Download https://cdn.example.net/jquery.js
     * download(base.resolve("https://cdn.example.net/jquery.js"));
     * ---
     *
     * Note that a base path without a trailing slash is treated as a file: resolving "that"
     * against ".../this" yields ".../that", while against ".../this/" it yields
     * ".../this/that". Query parameters and the fragment of the base are not carried over.
     *
     * Throws:
     *      URLException if `other` cannot be parsed.
     */
    URL resolve(string other)
    {
        if (other.length == 0) return this;
        if (other[0] == '/')
        {
            if (other.length > 1 && other[1] == '/')
            {
                // Uncommon syntax: a link like "//wikimedia.org" means "same scheme, switch URL"
                return parseURL(this.scheme ~ ':' ~ other);
            }
        }
        else
        {
            // An absolute URL has "://" before any other slash, i.e. the first '/' in the
            // string is the one inside "://". "a/b://c" is a relative path.
            auto schemeEnd = other.indexOf("://");
            if (schemeEnd > 0 && schemeEnd < other.indexOf('/'))
            {
                // separate URL
                return other.parseURL;
            }
        }

        URL ret = this;
        ret.path = "";
        ret.queryParams = ret.queryParams.init;
        // The fragment belongs to the reference, never to the base.
        ret.fragment = null;
        if (other[0] != '/')
        {
            // relative to something
            if (!this.path.length)
            {
                // nothing to be relative to
                other = "/" ~ other;
            }
            else if (this.path[$-1] == '/')
            {
                // directory-style path for the current thing
                // resolve relative to this directory
                other = this.path ~ other;
            }
            else
            {
                // this is a file-like thing
                // find the 'directory' and relative to that
                other = this.path[0..this.path.lastIndexOf('/') + 1] ~ other;
            }
        }
        if (!parsePathAndQuery(ret, other))
        {
            throw new URLException("failed to resolve relative URL " ~ other);
        }
        return ret;
    }
}

/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * Params:
 *      value = the text to parse
 *      url = receives the parsed URL; its contents are unspecified when parsing fails
 *
 * Returns: true on success; false if the input is malformed (bad percent-encoding, a
 * non-numeric or out-of-range port, an unterminated IPv6 literal).
 */
bool tryParseURL(string value, out URL url)
{
    url = URL.init;
    // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]
    // Scheme is optional in common use. We infer 'http' if it's not given.
    auto i = value.indexOf("//");
    if (i > -1) {
        if (i > 1) {
            // Drop the ':' that precedes "//".
            url.scheme = value[0..i-1];
        }
        value = value[i+2 .. $];
    } else {
        url.scheme = "http";
    }
    // Check for an ipv6 hostname.
    // [user:password@]host[:port]][/]path[?query][#fragment
    // The authority ends at the first '/', '?' or '#'; ':' and '[' need a closer look.
    i = value.indexOfAny(":/[?#");
    if (i == -1) {
        // Just a hostname.
        url.host = value.fromPuny;
        return true;
    }

    if (value[i] == ':') {
        // This could be between username and password, or it could be between host and port.
        auto j = value.indexOfAny(['@', '/']);
        if (j > -1 && value[j] == '@') {
            try {
                url.user = value[0..i].percentDecode;
                url.pass = value[i+1 .. j].percentDecode;
            } catch (URLException) {
                return false;
            }
            value = value[j+1 .. $];
        }
    }

    // It's trying to be a host/port, not a user/pass.
    i = value.indexOfAny(":/[?#");
    if (i == -1) {
        url.host = value.fromPuny;
        return true;
    }

    // Find the hostname. It's either an ipv6 address (which has special rules) or not (which doesn't
    // have special rules). -- The main sticking point is that ipv6 addresses have colons, which we
    // handle specially, and are offset with square brackets.
    if (value[i] == '[') {
        auto j = value[i..$].indexOf(']');
        if (j < 0) {
            // unterminated ipv6 addr
            return false;
        }
        // includes square brackets
        url.host = value[i .. i+j+1];
        value = value[i+j+1 .. $];
        if (value.length == 0) {
            // read to end of string; we finished parse
            return true;
        }
        if (value[0] != ':' && value[0] != '?' && value[0] != '/' && value[0] != '#') {
            return false;
        }
    } else {
        // Normal host.
        url.host = value[0..i].fromPuny;
        value = value[i .. $];
    }

    if (value[0] == ':') {
        // The port runs up to the start of the path, query or fragment, whichever is first.
        auto end = value.indexOfAny("/?#");
        if (end == -1) {
            end = value.length;
        }
        try {
            url.port = value[1 .. end].to!ushort;
        } catch (ConvException) {
            return false;
        }
        value = value[end .. $];
        if (value.length == 0) {
            return true;
        }
    }
    return parsePathAndQuery(url, value);
}

/**
 * Parse "[path][?query][#fragment]" into the given URL.
 *
 * The path and fragment are percent-decoded and assigned; each "key[=value]" pair of the query
 * is percent-decoded and appended to url.queryParams. When the input has neither '?' nor '#',
 * only the path is assigned.
 *
 * Returns: false if any component contains invalid percent-encoding, true otherwise.
 */
private bool parsePathAndQuery(ref URL url, string value)
{
    auto i = value.indexOfAny("?#");

    try
    {
        // Guarded in both cases so that this function never throws on malformed escapes.
        url.path = (i == -1 ? value : value[0..i]).percentDecode;
    }
    catch (URLException)
    {
        return false;
    }
    if (i == -1)
    {
        return true;
    }

    auto c = value[i];
    value = value[i + 1 .. $];
    if (c == '?')
    {
        i = value.indexOf('#');
        string query;
        if (i < 0)
        {
            query = value;
            value = null;
        }
        else
        {
            query = value[0..i];
            value = value[i + 1 .. $];
        }
        auto queries = query.split('&');
        foreach (q; queries)
        {
            auto j = q.indexOf('=');
            string key, val;
            if (j < 0)
            {
                key = q;
            }
            else
            {
                key = q[0..j];
                val = q[j + 1 .. $];
            }
            try
            {
                key = key.percentDecode;
                val = val.percentDecode;
            }
            catch (URLException)
            {
                return false;
            }
            url.queryParams.add(key, val);
        }
    }

    // Whatever is left (possibly nothing) is the fragment.
    try
    {
        url.fragment = value.percentDecode;
    }
    catch (URLException)
    {
        return false;
    }

    return true;
}

unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("gibe", "clay");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            queryParams.add("❄", "❀");
            queryParams.add("[", "]");
            fragment = "ş";
        }
        assert(
                // Either parameter order is accepted.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        // No path was set, so a lone "/" is expected after the authority.
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            queryParams.add("hi", "bye");
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

// Scheme-relative input: a leading "//" with no scheme still yields host and path.
unittest
{
    auto url = "//foo/bar".parseURL;
    assert(url.host == "foo", "expected host foo, got " ~ url.host);
    assert(url.path == "/bar");
}

unittest
{
    // ipv6 hostnames!
{
        // full range of data
        auto url = parseURL("https://bob:secret@[::1]:2771/foo/bar");
        assert(url.scheme == "https", url.scheme);
        assert(url.user == "bob", url.user);
        assert(url.pass == "secret", url.pass);
        // The square brackets are kept as part of the host.
        assert(url.host == "[::1]", url.host);
        assert(url.port == 2771, url.port.to!string);
        assert(url.path == "/foo/bar", url.path);
    }

    // minimal
    {
        auto url = parseURL("[::1]");
        assert(url.host == "[::1]", url.host);
    }

    // some random bits
    {
        auto url = parseURL("http://[::1]/foo");
        assert(url.scheme == "http", url.scheme);
        assert(url.host == "[::1]", url.host);
        assert(url.path == "/foo", url.path);
    }

    // full-length address followed by path, query and fragment
    {
        auto url = parseURL("https://[2001:0db8:0:0:0:0:1428:57ab]/?login=true#justkidding");
        assert(url.scheme == "https");
        assert(url.host == "[2001:0db8:0:0:0:0:1428:57ab]");
        assert(url.path == "/");
        assert(url.fragment == "justkidding");
    }
}

// Appending path elements to a URL that was given without a scheme.
unittest
{
    auto url = "localhost:5984".parseURL;
    auto url2 = url ~ "db1";
    assert(url2.toString == "http://localhost:5984/db1", url2.toString);
    auto url3 = url2 ~ "_all_docs";
    assert(url3.toString == "http://localhost:5984/db1/_all_docs", url3.toString);
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("gibe", "clay");
            fragment = "frag";
        }
        assert(
                // Either parameter order is accepted.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Passing an array of query values.
URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            // The same key added twice must appear twice in the output.
            queryParams.add("hello", "world");
            queryParams.add("hello", "aether");
            fragment = "frag";
        }
        assert(
                // Either parameter order is accepted.
                url.toString == "https://example.org/foo/bar?hello=world&hello=aether#frag" ||
                url.toString == "https://example.org/foo/bar?hello=aether&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            queryParams.add("❄", "❀");
            queryParams.add("[", "]");
            fragment = "ş";
        }
        assert(
                // Either parameter order is accepted.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            queryParams.add("hi", "bye");
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest {
    // Percent decoding.

    // Decoded, the user-info part reads: http://#:!:@
    auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
    auto url = urlString.parseURL;
    assert(url.user == "#");
    assert(url.pass == "!:");
    assert(url.host == "example.org");
    assert(url.path == "/{/}");
    // "%3B" has no '=', so the key ";" carries an empty value.
    assert(url.queryParams[";"].front == "");
    assert(url.queryParams["&"].front == "=");
    assert(url.fragment == "#hash");

    // Round trip.
assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
    assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

// Punycode host labels are decoded to Unicode while parsing...
unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.host == "☂.☃.org", url.host);
}

// ...and Unicode host labels are Punycode-encoded when formatting.
unittest {
    auto url = "https://☂.☃.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
    // There's an existing path.
    auto url = parseURL("http://example.org/foo");
    URL url2;
    // No slash? Assume it needs a slash.
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // With slash? Don't add another.
    url2 = url ~ "/bar";
    assert(url2.toString == "http://example.org/foo/bar", url2.toString);
    url ~= "bar";
    assert(url.toString == "http://example.org/foo/bar");

    // Path already ends with a slash; don't add another.
    url = parseURL("http://example.org/foo/");
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // Still don't add one even if you're appending with a slash.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "/bar";
    assert(url.toString == "http://example.org/foo/bar");

    // No path.
    url = parseURL("http://example.org");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar");

    // Path is just a slash.
    url = parseURL("http://example.org/");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar", url.toString);

    // No path, just fragment.
url = "ircs://irc.freenode.com/#d".parseURL;
    assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}
unittest
{
    // basic resolve()
    {
        // The base path ends in '/', so relative references resolve inside it.
        auto base = "https://example.org/this/".parseURL;
        assert(base.resolve("that") == "https://example.org/this/that");
        assert(base.resolve("/that") == "https://example.org/that");
        assert(base.resolve("//example.net/that") == "https://example.net/that");
    }

    // ensure we don't preserve query params
    {
        // The base path has no trailing '/', so "this" is replaced rather than extended.
        auto base = "https://example.org/this?query=value&other=value2".parseURL;
        assert(base.resolve("that") == "https://example.org/that");
        assert(base.resolve("/that") == "https://example.org/that");
        assert(base.resolve("//example.net/that") == "https://example.net/that");
    }
}


unittest
{
    // Compile-time check only: is(typeof(...)) tests that the call type-checks with a URL
    // argument without evaluating it.
    import std.net.curl;
    auto url = "http://example.org".parseURL;
    assert(is(typeof(std.net.curl.get(url))));
}

/**
 * Parse the input string as a URL.
 *
 * Throws:
 *      URLException if the string was in an incorrect format.
*/
URL parseURL(string value) {
    URL parsed;
    // Guard clause: anything tryParseURL rejects becomes an exception.
    if (!tryParseURL(value, parsed)) {
        throw new URLException("failed to parse URL " ~ value);
    }
    return parsed;
}

///
unittest {
    {
        // Infer scheme
        auto u1 = parseURL("example.org");
        assert(u1.scheme == "http");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 80);
        assert(u1.providedPort == 0);
        assert(u1.fragment == "");
    }
    {
        // Simple host and scheme
        auto u1 = parseURL("https://example.org");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With path
        auto u1 = parseURL("https://example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With explicit port
        auto u1 = parseURL("https://example.org:1021/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 1021);
        assert(u1.providedPort == 1021);
    }
    {
        // With user
        auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
    }
    {
        // With user, URL-encoded
        auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob!");
        assert(u1.pass == "secret!?");
    }
    {
        // With user and port and path
        auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 2210);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
        assert(u1.fragment == "");
    }
    {
        // With query string
        auto u1 = parseURL("https://example.org/?login=true");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.queryParams["login"].front == "true");
        assert(u1.fragment == "");
    }
    {
        // With query string and fragment
        auto u1 = parseURL("https://example.org/?login=true#justkidding");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.queryParams["login"].front == "true");
        assert(u1.fragment == "justkidding");
    }
    {
        // With URL-encoded values
        auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
        assert(u1.queryParams["❄"].front == "=");
        assert(u1.fragment == "^");
    }
}

unittest {
    assert(parseURL("http://example.org").port == 80);
    assert(parseURL("http://example.org:5326").port == 5326);

    auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
    assert(url.scheme == "redis");
    assert(url.user == "admin");
    assert(url.pass == "password");

    assert(parseURL("example.org").toString == "http://example.org/");
    assert(parseURL("http://example.org:80").toString == "http://example.org/");

    assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}

/**
 * Percent-encode a
string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    import std.utf : encode;
    import std.array : Appender;

    static immutable hexDigits = "0123456789ABCDEF";
    Appender!string encoded;
    foreach (dchar codePoint; raw) {
        switch (codePoint) {
            case 'a': .. case 'z':
            case 'A': .. case 'Z':
            case '0': .. case '9':
            case '-', '.', '_', '~':
                // Unreserved: copied through untouched.
                encoded ~= codePoint;
                break;
            default:
                // Everything else, ASCII or not, is written as one %XX escape per octet of
                // its UTF-8 encoding (and we hope the server understands UTF-8).
                char[] utf8;
                encode(utf8, codePoint);
                foreach (char unit; utf8) {
                    encoded ~= '%';
                    encoded ~= hexDigits[unit >> 4];
                    encoded ~= hexDigits[unit & 0x0F];
                }
                break;
        }
    }
    return encoded.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
*
 * This explicitly ensures that the result is a valid UTF-8 string.
 *
 * Throws:
 *      URLException if a '%' is not followed by two hex digits, or if the decoded octets are
 *      not valid UTF-8.
 */
string percentDecode(string encoded)
{
    import std.utf : validate, UTFException;
    auto raw = percentDecodeRaw(encoded);
    auto s = cast(string) raw;
    try
    {
        validate(s);
    }
    catch (UTFException e)
    {
        throw new URLException(
                "The percent-encoded data `" ~ encoded ~ "` does not represent a valid UTF-8 sequence.");
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    e = percentDecode("%e2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    try {
        // %ES is an invalid percent sequence: 'S' is not a hex digit.
        percentDecode("%es");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        percentDecode("%e");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        // A lone '%' must be reported as a URLException, not as an out-of-range access.
        percentDecode("%");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException.
 */
immutable(ubyte)[] percentDecodeRaw(string encoded)
{
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    import std.array : Appender;
    Appender!(immutable(ubyte)[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= encoded[i];
            continue;
        }
        // Written as an addition on purpose: `encoded.length - 2` would wrap around for
        // inputs shorter than two characters and let the indexing below run off the end.
        if (i + 2 >= encoded.length) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        if (isHex(encoded[i + 1]) && isHex(encoded[i + 2])) {
            auto b = fromHex(encoded[i + 1]);
            auto c = fromHex(encoded[i + 2]);
            app ~= cast(ubyte)((b << 4) | c);
        } else {
            throw new URLException("Invalid percent encoded value: expected two hex digits after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        // Skip the two digits just consumed; the loop increment steps past the '%'.
        i += 2;
    }
    return app.data;
}

/// True if `c` is an ASCII hexadecimal digit of either case.
private bool isHex(char c) {
    return ('0' <= c && '9' >= c) ||
        ('a' <= c && 'f' >= c) ||
        ('A' <= c && 'F' >= c);
}

/// Value (0-15) of a hexadecimal digit of either case. The caller must have checked isHex.
private ubyte fromHex(char s) {
    enum caseDiff = 'a' - 'A';
    if (s >= 'a' && s <= 'z') {
        // Fold to upper case so a single lookup table serves both cases.
        s -= caseDiff;
    }
    return cast(ubyte)("0123456789ABCDEF".indexOf(s));
}

/**
 * Convert a host name to its ASCII form for use in a machine-readable URL.
 *
 * Bracketed IPv6 literals and pure-ASCII names are returned unchanged; otherwise each
 * dot-separated label is passed through punyEncode.
 *
 * Throws:
 *      URLException if an ASCII character that is not allowed in a host name is found before
 *      the first non-ASCII character.
 */
private string toPuny(string unicodeHostname)
{
    // The parser stores IPv6 hosts with their brackets, e.g. "[::1]". They are not domain
    // names; the character check below would reject the '['.
    if (unicodeHostname.length >= 2 && unicodeHostname[0] == '[' && unicodeHostname[$ - 1] == ']') {
        return unicodeHostname;
    }
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c > 0x80) {
            mustEncode = true;
            break;
        }
        // Rejected: below ',', ':' through '@' (0x3A-0x40), '[' through '`' (0x5B-0x60), and
        // '{' (0x7B) upward.
        if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    import std.algorithm.iteration : map;
    return unicodeHostname.split('.').map!punyEncode.join(".");
}

/// Apply punyDecode to each dot-separated label of a host name.
private string fromPuny(string hostname)
{
    import std.algorithm.iteration : map;
    return hostname.split('.').map!punyDecode.join(".");
}

private {
    enum delimiter = '-';
enum marker = "xn--"; 1242 enum ulong damp = 700; 1243 enum ulong tmin = 1; 1244 enum ulong tmax = 26; 1245 enum ulong skew = 38; 1246 enum ulong base = 36; 1247 enum ulong initialBias = 72; 1248 enum dchar initialN = cast(dchar)128; 1249 1250 ulong adapt(ulong delta, ulong numPoints, bool firstTime) { 1251 if (firstTime) { 1252 delta /= damp; 1253 } else { 1254 delta /= 2; 1255 } 1256 delta += delta / numPoints; 1257 ulong k = 0; 1258 while (delta > ((base - tmin) * tmax) / 2) { 1259 delta /= (base - tmin); 1260 k += base; 1261 } 1262 return k + (((base - tmin + 1) * delta) / (delta + skew)); 1263 } 1264 } 1265 1266 /** 1267 * Encode the input string using the Punycode algorithm. 1268 * 1269 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked 1270 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com" 1271 * in Punycode, you will get "xn--m3h.xn--n3h.com". 1272 * 1273 * In order to puny-encode a domain name, you must split it into its components. The following will 1274 * typically suffice: 1275 * --- 1276 * auto domain = "☂.☃.com"; 1277 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join("."); 1278 * --- 1279 */ 1280 string punyEncode(string input) 1281 { 1282 import std.array : Appender; 1283 ulong delta = 0; 1284 dchar n = initialN; 1285 auto i = 0; 1286 auto bias = initialBias; 1287 Appender!string output; 1288 output ~= marker; 1289 auto pushed = 0; 1290 auto codePoints = 0; 1291 foreach (dchar c; input) { 1292 codePoints++; 1293 if (c <= initialN) { 1294 output ~= c; 1295 pushed++; 1296 } 1297 } 1298 if (pushed < codePoints) { 1299 if (pushed > 0) { 1300 output ~= delimiter; 1301 } 1302 } else { 1303 // No encoding to do. 
1304 return input; 1305 } 1306 bool first = true; 1307 while (pushed < codePoints) { 1308 auto best = dchar.max; 1309 foreach (dchar c; input) { 1310 if (n <= c && c < best) { 1311 best = c; 1312 } 1313 } 1314 if (best == dchar.max) { 1315 throw new URLException("failed to find a new codepoint to process during punyencode"); 1316 } 1317 delta += (best - n) * (pushed + 1); 1318 if (delta > uint.max) { 1319 // TODO better error message 1320 throw new URLException("overflow during punyencode"); 1321 } 1322 n = best; 1323 foreach (dchar c; input) { 1324 if (c < n) { 1325 delta++; 1326 } 1327 if (c == n) { 1328 ulong q = delta; 1329 auto k = base; 1330 while (true) { 1331 ulong t; 1332 if (k <= bias) { 1333 t = tmin; 1334 } else if (k >= bias + tmax) { 1335 t = tmax; 1336 } else { 1337 t = k - bias; 1338 } 1339 if (q < t) { 1340 break; 1341 } 1342 output ~= digitToBasic(t + ((q - t) % (base - t))); 1343 q = (q - t) / (base - t); 1344 k += base; 1345 } 1346 output ~= digitToBasic(q); 1347 pushed++; 1348 bias = adapt(delta, pushed, first); 1349 first = false; 1350 delta = 0; 1351 } 1352 } 1353 delta++; 1354 n++; 1355 } 1356 return cast(string)output.data; 1357 } 1358 1359 /** 1360 * Decode the input string using the Punycode algorithm. 1361 * 1362 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked 1363 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com" 1364 * in Punycode, you will get "xn--m3h.xn--n3h.com". 1365 * 1366 * In order to puny-decode a domain name, you must split it into its components. 
The following will 1367 * typically suffice: 1368 * --- 1369 * auto domain = "xn--m3h.xn--n3h.com"; 1370 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join("."); 1371 * --- 1372 */ 1373 string punyDecode(string input) { 1374 if (!input.startsWith(marker)) { 1375 return input; 1376 } 1377 input = input[marker.length..$]; 1378 1379 // let n = initial_n 1380 dchar n = cast(dchar)128; 1381 1382 // let i = 0 1383 // let bias = initial_bias 1384 // let output = an empty string indexed from 0 1385 size_t i = 0; 1386 auto bias = initialBias; 1387 dchar[] output; 1388 // This reserves a bit more than necessary, but it should be more efficient overall than just 1389 // appending and inserting volo-nolo. 1390 output.reserve(input.length); 1391 1392 // consume all code points before the last delimiter (if there is one) 1393 // and copy them to output, fail on any non-basic code point 1394 // if more than zero code points were consumed then consume one more 1395 // (which will be the last delimiter) 1396 auto end = input.lastIndexOf(delimiter); 1397 if (end > -1) { 1398 foreach (dchar c; input[0..end]) { 1399 output ~= c; 1400 } 1401 input = input[end+1 .. $]; 1402 } 1403 1404 // while the input is not exhausted do begin 1405 size_t pos = 0; 1406 while (pos < input.length) { 1407 // let oldi = i 1408 // let w = 1 1409 auto oldi = i; 1410 auto w = 1; 1411 // for k = base to infinity in steps of base do begin 1412 for (ulong k = base; k < uint.max; k += base) { 1413 // consume a code point, or fail if there was none to consume 1414 // Note that the input is all ASCII, so we can simply index the input string bytewise. 
1415 auto c = input[pos]; 1416 pos++; 1417 // let digit = the code point's digit-value, fail if it has none 1418 auto digit = basicToDigit(c); 1419 // let i = i + digit * w, fail on overflow 1420 i += digit * w; 1421 // let t = tmin if k <= bias {+ tmin}, or 1422 // tmax if k >= bias + tmax, or k - bias otherwise 1423 ulong t; 1424 if (k <= bias) { 1425 t = tmin; 1426 } else if (k >= bias + tmax) { 1427 t = tmax; 1428 } else { 1429 t = k - bias; 1430 } 1431 // if digit < t then break 1432 if (digit < t) { 1433 break; 1434 } 1435 // let w = w * (base - t), fail on overflow 1436 w *= (base - t); 1437 // end 1438 } 1439 // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 1440 bias = adapt(i - oldi, output.length + 1, oldi == 0); 1441 // let n = n + i div (length(output) + 1), fail on overflow 1442 n += i / (output.length + 1); 1443 // let i = i mod (length(output) + 1) 1444 i %= (output.length + 1); 1445 // {if n is a basic code point then fail} 1446 // (We aren't actually going to fail here; it's clear what this means.) 1447 // insert n into output at position i 1448 import std.array : insertInPlace; 1449 (() @trusted { output.insertInPlace(i, cast(dchar)n); })(); // should be @safe but isn't marked 1450 // increment i 1451 i++; 1452 // end 1453 } 1454 return output.to!string; 1455 } 1456 1457 // Lifted from punycode.js. 1458 private dchar digitToBasic(ulong digit) { 1459 return cast(dchar)(digit + 22 + 75 * (digit < 26)); 1460 } 1461 1462 // Lifted from punycode.js. 
// Maps a basic code point to its Punycode digit value:
// '0'..'9' give 26..35, while 'A'..'Z' and 'a'..'z' both give 0..25.
// Any other character yields `base`, which is not a valid digit value.
private uint basicToDigit(char c) {
    immutable uint code = c;
    if (code >= '0' && code <= '9') {
        return code - 22;
    }
    if (code >= 'A' && code <= 'Z') {
        return code - 'A';
    }
    if (code >= 'a' && code <= 'z') {
        return code - 'a';
    }
    return base;
}

// punyEncode: individual labels.
unittest {
    assert(punyEncode("b\u00FCcher") == "xn--bcher-kva");
    assert(punyEncode("b\u00FCc\u00FCher") == "xn--bcher-kvab");

    {
        auto encoded = punyEncode("ýbücher");
        assert(encoded == "xn--bcher-kvaf", encoded);
    }

    assert(punyEncode("mañana") == "xn--maana-pta");

    {
        auto arabic = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
        auto encoded = punyEncode(arabic);
        assert(encoded == "xn--egbpdaj6bu4bxfgehfvwxn", encoded);
    }
}

// punyDecode: individual labels.
unittest {
    {
        auto decoded = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
        assert(decoded == "ليهمابتكلموشعربي؟", decoded);
    }

    assert(punyDecode("xn--maana-pta") == "mañana");
}

// Whole domain names, converted label by label as shown in the documentation examples.
unittest {
    import std.string, std.algorithm, std.array, std.range;

    {
        auto encodedDomain = "xn--m3h.xn--n3h.com";
        auto decodedDomain = encodedDomain.splitter(".").map!(punyDecode).join(".");
        assert(decodedDomain == "☂.☃.com", decodedDomain);
    }
    {
        auto unicodeDomain = "☂.☃.com";
        auto encodedDomain = unicodeDomain.splitter(".").map!(punyEncode).join(".");
        assert(encodedDomain == "xn--m3h.xn--n3h.com", encodedDomain);
    }
}