/**
 * A URL handling library.
 *
 * URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
 * elements like port, path, username, and password.
 *
 * This module aims to make it simple to muck about with them.
 *
 * Example usage:
 * ---
 * auto url = "ssh://me:password@192.168.0.8/".parseURL;
 * auto files = system("ssh", url.toString, "ls").splitLines;
 * foreach (file; files) {
 *     system("scp", url ~ file, ".");
 * }
 * ---
 *
 * License: The MIT license.
 */
module url;

import std.algorithm;
import std.array;
import std.conv;
import std.encoding;
import std.string;
import std.utf;

/// An exception thrown when something bad happens with URLs.
class URLException : Exception {
    this(string msg) { super(msg); }
}

/**
 * A mapping from schemes to their default ports.
 *
 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
 * use even if they use ports. Entries here should be treated as best guesses.
 */
ushort[string] schemeToDefaultPort;

static this() {
    schemeToDefaultPort = [
        "aaa": 3868,
        "aaas": 5658,
        "acap": 674,
        "cap": 1026,
        "coap": 5683,
        "coaps": 5684,
        "dav": 443,
        "dict": 2628,
        "ftp": 21,
        "git": 9418,
        "go": 1096,
        "gopher": 70,
        "http": 80,
        "https": 443,
        "iac": 4569,
        "icap": 1344,
        "imap": 143,
        "ipp": 631,
        "ipps": 631,  // yes, they're both mapped to port 631
        "irc": 6667,  // De facto default port, not the IANA reserved port.
        "ircs": 6697,
        "iris": 702,  // defaults to iris.beep
        "iris.beep": 702,
        "iris.lwz": 715,
        "iris.xpc": 713,
        "iris.xpcs": 714,
        "jabber": 5222,  // client-to-server
        "ldap": 389,
        "ldaps": 636,
        "msrp": 2855,
        "msrps": 2855,
        "mtqp": 1038,
        "mupdate": 3905,
        "news": 119,
        "nfs": 2049,
        "pop": 110,
        "redis": 6379,
        "reload": 6084,
        "rsync": 873,
        "rtmfp": 1935,
        "rtsp": 554,
        "shttp": 80,
        "sieve": 4190,
        "sip": 5060,
        "sips": 5061,
        "smb": 445,
        "smtp": 25,
        "snews": 563,
        "snmp": 161,
        "soap.beep": 605,
        "ssh": 22,
        "stun": 3478,
        "stuns": 5349,
        "svn": 3690,
        "teamspeak": 9987,
        "telnet": 23,
        "tftp": 69,
        "tip": 3372,
    ];
}

/**
 * A Uniform Resource Locator.
 *
 * URLs can be parsed (see parseURL) and implicitly convert to strings.
 */
struct URL {
    /// The URL scheme. For instance, ssh, ftp, or https.
    string scheme;

    /// The username in this URL. Usually absent. It may be present without a password.
    string user;

    /// The password in this URL. Usually absent.
    string pass;

    /// The hostname.
    string host;

    /**
     * The port.
     *
     * This is inferred from the scheme if it isn't present in the URL itself.
     * If the scheme is not known and the port is not present, the port will be given as 0.
     * For some schemes, port will not be sensible -- for instance, file or chrome-extension.
     *
     * If you explicitly need to detect whether the user provided a port, check the providedPort
     * field.
     */
    @property ushort port() {
        if (providedPort != 0) {
            return providedPort;
        }
        if (auto p = scheme in schemeToDefaultPort) {
            return *p;
        }
        return 0;
    }

    /**
     * Set the port.
     *
     * This sets the providedPort field and is provided for convenience.
     */
    @property ushort port(ushort value) {
        return providedPort = value;
    }

    /// The port that was explicitly provided in the URL.
    ushort providedPort;

    /**
     * The path.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
     * "/news/story/17774".
     */
    string path;

    /**
     * The query string elements.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
     * elements will be ["visited": "false"].
     *
     * Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
     * ["item": ""].
     *
     * This field is mutable, so be cautious.
     */
    string[string] query;

    /**
     * The fragment. In web documents, this typically refers to an anchor element.
     * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
     */
    string fragment;

    /**
     * Convert this URL to a string.
     * The string is properly formatted and usable for, eg, a web request.
     *
     * Throws:
     *      URLException if the host contains a character that is illegal in a domain name.
     */
    string toString() {
        return toString(false);
    }

    /**
     * Convert this URL to a string.
     * The string is intended to be human-readable rather than machine-readable.
     */
    string toHumanReadableString() {
        return toString(true);
    }

    // Shared implementation of toString / toHumanReadableString. When humanReadable is true,
    // no percent-encoding or Punycode encoding is applied.
    private string toString(bool humanReadable) {
        Appender!string s;
        s ~= scheme;
        s ~= "://";
        // Test lengths, not pointers: an empty-but-non-null string must not produce "user info".
        if (user.length > 0 || pass.length > 0) {
            s ~= humanReadable ? user : user.percentEncode;
            if (pass.length > 0) {
                s ~= ":";
                s ~= humanReadable ? pass : pass.percentEncode;
            }
            s ~= "@";
        }
        s ~= humanReadable ? host : host.toPuny;
        if (providedPort) {
            // Only print the port when it differs from the scheme's default.
            auto defaultPort = scheme in schemeToDefaultPort;
            if (defaultPort is null || *defaultPort != providedPort) {
                s ~= ":";
                s ~= providedPort.to!string;
            }
        }
        string p = path;
        if (p.length == 0 || p == "/") {
            s ~= '/';
        } else {
            if (p[0] == '/') {
                p = p[1..$];
            }
            if (humanReadable) {
                s ~= '/';
                s ~= p;
            } else {
                // Encode each segment separately so the separators themselves stay literal.
                foreach (part; p.split('/')) {
                    s ~= '/';
                    s ~= part.percentEncode;
                }
            }
        }
        if (query) {
            s ~= '?';
            bool first = true;
            // Iteration order is that of the associative array and is therefore unspecified.
            foreach (k, v; query) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                if (v.length > 0) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        }
        if (fragment.length > 0) {
            s ~= '#';
            s ~= fragment.percentEncode;
        }
        return s.data;
    }

    /// Implicitly convert URLs to strings.
    alias toString this;

    /**
     * The append operator (~).
     *
     * The append operator for URLs returns a new URL with the given string appended as a path
     * element to the URL's path. It only adds new path elements (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Query elements are copied.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * auto randInt = random ~ "int";
     * writeln(randInt);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opBinary(string op : "~")(string subsequentPath) {
        URL other = this;
        other ~= subsequentPath;
        if (query) {
            // Give the copy its own table so later edits don't leak back into this URL.
            other.query = other.query.dup;
        }
        return other;
    }

    /**
     * The append-in-place operator (~=).
     *
     * The append operator for URLs adds a path element to this URL. It only adds new path elements
     * (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * random ~= "int";
     * writeln(random);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opOpAssign(string op : "~")(string subsequentPath) {
        if (path.endsWith("/") || subsequentPath.startsWith("/")) {
            if (path.endsWith("/") && subsequentPath.startsWith("/")) {
                // Both sides supplied a separator; keep only one.
                path ~= subsequentPath[1..$];
            } else {
                path ~= subsequentPath;
            }
        } else {
            path ~= '/';
            path ~= subsequentPath;
        }
        return this;
    }
}

/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * The accepted shape is
 * scheme://[user[:password]@]host[:port][/path][?query][#fragment], where the scheme is optional
 * and defaults to "http". The authority section ends at the first '/', '?' or '#'.
 *
 * Params:
 *      value = the text to parse
 *      url = receives the parsed URL; its contents are unspecified when parsing fails
 *
 * Returns: true if the value was parsed, false if it is malformed (bad percent encoding, bad
 * Punycode, invalid UTF-8 or an unparseable port). This function does not throw for malformed
 * input.
 */
bool tryParseURL(string value, out URL url) {
    url = URL.init;
    try {
        return parseURLParts(value, url);
    } catch (URLException) {
        return false;
    } catch (ConvException) {
        // The port was not a number in the ushort range.
        return false;
    } catch (UTFException) {
        return false;
    }
}

// Does the actual parsing for tryParseURL. Malformed components surface as exceptions, which
// tryParseURL translates into a false return.
private bool parseURLParts(string value, ref URL url) {
    // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]
    // Scheme is optional in common use. We infer 'http' if it's not given.
    auto i = value.indexOf("://");
    // A "://" that shows up after the path/query/fragment started is not a scheme separator.
    if (i > -1 && value[0 .. i].indexOfAny("/?#") == -1) {
        url.scheme = value[0 .. i];
        value = value[i + 3 .. $];
    } else {
        url.scheme = "http";
    }

    // [user[:password]@]host[:port] runs up to the first '/', '?' or '#'.
    string authority = value;
    string rest = null;
    auto authorityEnd = value.indexOfAny("/?#");
    if (authorityEnd > -1) {
        authority = value[0 .. authorityEnd];
        rest = value[authorityEnd .. $];
    }

    auto at = authority.indexOf('@');
    if (at > -1) {
        auto userInfo = authority[0 .. at];
        auto colon = userInfo.indexOf(':');
        if (colon == -1) {
            url.user = userInfo.percentDecode;
        } else {
            url.user = userInfo[0 .. colon].percentDecode;
            url.pass = userInfo[colon + 1 .. $].percentDecode;
        }
        authority = authority[at + 1 .. $];
    }

    auto portStart = authority.indexOf(':');
    if (portStart == -1) {
        url.host = authority.fromPuny;
    } else {
        url.host = authority[0 .. portStart].fromPuny;
        url.port = authority[portStart + 1 .. $].to!ushort;
    }

    if (rest.length == 0) {
        return true;
    }

    // [/]path[?query][#fragment]
    i = rest.indexOfAny("?#");
    if (i == -1) {
        url.path = rest.percentDecode;
        return true;
    }
    url.path = rest[0 .. i].percentDecode;
    auto c = rest[i];
    rest = rest[i + 1 .. $];
    if (c == '?') {
        i = rest.indexOf('#');
        string query;
        if (i < 0) {
            query = rest;
            rest = null;
        } else {
            query = rest[0 .. i];
            rest = rest[i + 1 .. $];
        }
        foreach (q; query.split('&')) {
            if (q.length == 0) {
                // "a=1&&b=2" or a trailing '&' carries no element.
                continue;
            }
            auto j = q.indexOf('=');
            if (j == -1) {
                url.query[q.percentDecode] = "";
            } else {
                url.query[q[0 .. j].percentDecode] = q[j + 1 .. $].percentDecode;
            }
        }
    }

    url.fragment = rest.percentDecode;
    return true;
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            query["hello"] = "world";
            query["gibe"] = "clay";
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            query["❄"] = "❀";
            query["["] = "]";
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            query["hi"] = "bye";
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest {
    // Percent decoding.

    // http://#:!:@
    auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
    auto url = urlString.parseURL;
    assert(url.user == "#");
    assert(url.pass == "!:");
    assert(url.host == "example.org");
    assert(url.path == "/{/}");
    assert(url.query[";"] == "");
    assert(url.query["&"] == "=");
    assert(url.fragment == "#hash");

    // Round trip.
    assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
    assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.host == "☂.☃.org", url.host);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
    assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString);
}

unittest {
    auto url = "https://☂.☃.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
    // There's an existing path.
    auto url = parseURL("http://example.org/foo");
    // No slash? Assume it needs a slash.
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // With slash? Don't add another.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/foo/bar");

    // Path already ends with a slash; don't add another.
    url = parseURL("http://example.org/foo/");
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // Still don't add one even if you're appending with a slash.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "/bar";
    assert(url.toString == "http://example.org/foo/bar");

    // No path.
    url = parseURL("http://example.org");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar");

    // Path is just a slash.
    url = parseURL("http://example.org/");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar", url.toString);

    // No path, just fragment.
    url = "ircs://irc.freenode.com/#d".parseURL;
    assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest {
    import std.net.curl;
    auto url = "http://example.org".parseURL;
    assert(is(typeof(std.net.curl.get(url))));
}

unittest {
    // Regression checks for inputs that used to crash, throw, or be misparsed.
    URL u;

    // Query and fragment directly after the host, without a path.
    assert(tryParseURL("http://example.org?x=1#top", u));
    assert(u.host == "example.org", u.host);
    assert(u.path == "");
    assert(u.query["x"] == "1");
    assert(u.fragment == "top");

    // Port followed directly by a query.
    assert(tryParseURL("http://example.org:8080?x=1", u));
    assert(u.host == "example.org");
    assert(u.port == 8080);
    assert(u.query["x"] == "1");

    // Username without a password.
    assert(tryParseURL("http://user@example.org:8080/", u));
    assert(u.user == "user");
    assert(u.pass == "");
    assert(u.host == "example.org");
    assert(u.port == 8080);
    assert(u.toString == "http://user@example.org:8080/", u.toString);

    // "://" inside the query is not a scheme separator.
    assert(tryParseURL("example.org/go?to=http://other.example", u));
    assert(u.scheme == "http");
    assert(u.host == "example.org");
    assert(u.path == "/go");

    // Malformed input is reported through the return value.
    assert(!tryParseURL("http://example.org/%", u));
    assert(!tryParseURL("http://example.org/%zz", u));
    assert(!tryParseURL("http://example.org:notaport/", u));
    assert(!tryParseURL("http://xn--9/", u));

    // Empty query elements are skipped.
    assert(tryParseURL("http://example.org/?a=1&&b=2&", u));
    assert(u.query.length == 2);
}

/**
 * Parse the input string as a URL.
 *
 * Throws:
 *      URLException if the string was in an incorrect format.
 */
URL parseURL(string value) {
    URL url;
    if (tryParseURL(value, url)) {
        return url;
    }
    throw new URLException("failed to parse URL " ~ value);
}

///
unittest {
    {
        // Infer scheme
        auto u1 = parseURL("example.org");
        assert(u1.scheme == "http");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 80);
        assert(u1.providedPort == 0);
        assert(u1.fragment == "");
    }
    {
        // Simple host and scheme
        auto u1 = parseURL("https://example.org");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With path
        auto u1 = parseURL("https://example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With explicit port
        auto u1 = parseURL("https://example.org:1021/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 1021);
        assert(u1.providedPort == 1021);
    }
    {
        // With user
        auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
    }
    {
        // With user, URL-encoded
        auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob!");
        assert(u1.pass == "secret!?");
    }
    {
        // With user and port and path
        auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 2210);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
        assert(u1.fragment == "");
    }
    {
        // With query string
        auto u1 = parseURL("https://example.org/?login=true");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.query["login"] == "true");
        assert(u1.fragment == "");
    }
    {
        // With query string and fragment
        auto u1 = parseURL("https://example.org/?login=true#justkidding");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.query["login"] == "true");
        assert(u1.fragment == "justkidding");
    }
    {
        // With URL-encoded values
        auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
        assert(u1.query["❄"] == "=");
        assert(u1.fragment == "^");
    }
}

unittest {
    assert(parseURL("http://example.org").port == 80);
    assert(parseURL("http://example.org:5326").port == 5326);

    auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
    assert(url.scheme == "redis");
    assert(url.user == "admin");
    assert(url.pass == "password");

    assert(parseURL("example.org").toString == "http://example.org/");
    assert(parseURL("http://example.org:80").toString == "http://example.org/");

    assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}

/**
 * Percent-encode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * Params:
 *      raw = the text to encode
 *
 * Returns: the text with every character other than ASCII letters, digits, '-', '.', '_' and '~'
 * replaced by the percent-encoded octets of its UTF-8 representation (uppercase hex).
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    static immutable hexDigits = "0123456789ABCDEF";
    Appender!string app;
    foreach (dchar d; raw) {
        if (('a' <= d && 'z' >= d) ||
                ('A' <= d && 'Z' >= d) ||
                ('0' <= d && '9' >= d) ||
                d == '-' || d == '.' || d == '_' || d == '~') {
            app ~= d;
            continue;
        }
        // Everything else is emitted as the percent-encoded octets of its UTF-8 encoding
        // (and we hope the server understands UTF-8). A code point needs at most four octets,
        // so a stack buffer is enough.
        char[4] buf;
        auto len = std.utf.encode(buf, d);
        foreach (char octet; buf[0 .. len]) {
            auto b = cast(ubyte) octet;
            app ~= '%';
            app ~= hexDigits[b >> 4];
            app ~= hexDigits[b & 0x0F];
        }
    }
    return app.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);

    e = percentEncode("a b/c");
    assert(e == "a%20b%2Fc", e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This explicitly ensures that the result is a valid UTF-8 string.
 *
 * Throws:
 *      URLException if the input is not well-formed percent encoding or does not decode to
 *      valid UTF-8.
 */
string percentDecode(string encoded) {
    ubyte[] raw = percentDecodeRaw(encoded);
    auto s = cast(string) raw;
    if (!s.isValid) {
        throw new URLException("input contains invalid UTF data");
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    // Hex digits are case-insensitive.
    e = percentDecode("%e2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException.
 *
 * Throws:
 *      URLException if a percent sign is not followed by two hexadecimal digits.
 */
ubyte[] percentDecodeRaw(string encoded) {
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    Appender!(ubyte[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= cast(ubyte) encoded[i];
            continue;
        }
        // Written as an addition so that very short inputs cannot underflow the unsigned length.
        if (i + 2 >= encoded.length) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        auto high = hexDigitValue(encoded[i + 1]);
        auto low = hexDigitValue(encoded[i + 2]);
        if (high < 0 || low < 0) {
            throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
                    "digits after percent symbol. Error at index " ~ i.to!string);
        }
        app ~= cast(ubyte)((high << 4) | low);
        i += 2;
    }
    return app.data;
}

// Value of a single hexadecimal digit (either case), or -1 if the character is not one.
private int hexDigitValue(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}

// Convert a hostname to its ASCII form, Punycode-encoding each label if the name contains any
// non-ASCII code point. ASCII punctuation seen before the first non-ASCII code point is
// rejected with a URLException.
private string toPuny(string unicodeHostname) {
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c >= 0x80) {
            mustEncode = true;
            break;
        }
        // Reject the ASCII ranges below ',', ':' through '@', '[' through '`', and '{' upward.
        if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    return unicodeHostname.split('.').map!punyEncode.join(".");
}

// Convert a hostname from its ASCII form, Punycode-decoding each "xn--" label.
private string fromPuny(string hostname) {
    return hostname.split('.').map!punyDecode.join(".");
}

private {
    // Punycode parameters; the names follow the pseudocode quoted in punyDecode below.
    enum delimiter = '-';
    enum marker = "xn--";
    enum ulong damp = 700;
    enum ulong tmin = 1;
    enum ulong tmax = 26;
    enum ulong skew = 38;
    enum ulong base = 36;
    enum ulong initialBias = 72;
    enum dchar initialN = cast(dchar)128;

    // Bias adaptation step of the Punycode algorithm.
    ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
        if (firstTime) {
            delta /= damp;
        } else {
            delta /= 2;
        }
        delta += delta / numPoints;
        ulong k = 0;
        while (delta > ((base - tmin) * tmax) / 2) {
            delta /= (base - tmin);
            k += base;
        }
        return k + (((base - tmin + 1) * delta) / (delta + skew));
    }
}

/**
 * Encode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-encode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "☂.☃.com";
 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
 * ---
 *
 * Returns: the encoded segment, or the input unchanged if it contains only ASCII.
 *
 * Throws:
 *      URLException if the encoder state overflows.
 */
string punyEncode(string input) {
    ulong delta = 0;
    dchar n = initialN;
    auto bias = initialBias;
    Appender!string output;
    output ~= marker;
    size_t pushed = 0;
    size_t codePoints = 0;
    // Basic (ASCII) code points are copied through verbatim, in order.
    foreach (dchar c; input) {
        codePoints++;
        if (c < initialN) {
            output ~= c;
            pushed++;
        }
    }
    if (pushed == codePoints) {
        // No encoding to do.
        return input;
    }
    if (pushed > 0) {
        output ~= delimiter;
    }
    bool first = true;
    while (pushed < codePoints) {
        // Find the smallest code point that has not been handled yet. uint.max is used as the
        // "none found" sentinel because it is not a possible code point.
        uint best = uint.max;
        foreach (dchar c; input) {
            if (n <= c && c < best) {
                best = c;
            }
        }
        if (best == uint.max) {
            throw new URLException("failed to find a new codepoint to process during punyencode");
        }
        // Widen before multiplying so the product cannot wrap in 32 bits.
        delta += cast(ulong)(best - n) * (pushed + 1);
        if (delta > uint.max) {
            throw new URLException("overflow during punyencode");
        }
        n = cast(dchar) best;
        foreach (dchar c; input) {
            if (c < n) {
                delta++;
            }
            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                ulong q = delta;
                auto k = base;
                while (true) {
                    ulong t;
                    if (k <= bias) {
                        t = tmin;
                    } else if (k >= bias + tmax) {
                        t = tmax;
                    } else {
                        t = k - bias;
                    }
                    if (q < t) {
                        break;
                    }
                    output ~= digitToBasic(t + ((q - t) % (base - t)));
                    q = (q - t) / (base - t);
                    k += base;
                }
                output ~= digitToBasic(q);
                pushed++;
                bias = adapt(delta, pushed, first);
                first = false;
                delta = 0;
            }
        }
        delta++;
        n++;
    }
    return output.data;
}

/**
 * Decode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-decode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "xn--m3h.xn--n3h.com";
 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
 * ---
 *
 * Returns: the decoded segment, or the input unchanged if it does not start with "xn--".
 *
 * Throws:
 *      URLException if the segment is marked with "xn--" but is not well-formed Punycode.
 */
string punyDecode(string input) {
    if (!input.startsWith(marker)) {
        return input;
    }
    input = input[marker.length..$];

    // let n = initial_n
    dchar n = initialN;

    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    ulong i = 0;
    auto bias = initialBias;
    dchar[] output;
    // This reserves a bit more than necessary, but it should be more efficient overall than just
    // appending and inserting willy-nilly.
    output.reserve(input.length);

    // consume all code points before the last delimiter (if there is one)
    //  and copy them to output, fail on any non-basic code point
    // if more than zero code points were consumed then consume one more
    //  (which will be the last delimiter)
    auto end = input.lastIndexOf(delimiter);
    if (end > -1) {
        foreach (dchar c; input[0..end]) {
            if (c >= initialN) {
                throw new URLException("non-basic code point in punycode input");
            }
            output ~= c;
        }
        input = input[end+1 .. $];
    }

    // while the input is not exhausted do begin
    size_t pos = 0;
    while (pos < input.length) {
        //   let oldi = i
        //   let w = 1
        auto oldi = i;
        ulong w = 1;
        //   for k = base to infinity in steps of base do begin
        for (ulong k = base; ; k += base) {
            //     consume a code point, or fail if there was none to consume
            // Note that the input is all ASCII, so we can simply index the input string bytewise.
            if (pos >= input.length) {
                throw new URLException("punycode input ends in the middle of a code point");
            }
            auto c = input[pos];
            pos++;
            //     let digit = the code point's digit-value, fail if it has none
            auto digit = basicToDigit(c);
            if (digit >= base) {
                throw new URLException("invalid digit in punycode input");
            }
            //     let i = i + digit * w, fail on overflow
            i += digit * w;
            if (i > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            //     let t = tmin if k <= bias {+ tmin}, or
            //             tmax if k >= bias + tmax, or k - bias otherwise
            ulong t;
            if (k <= bias) {
                t = tmin;
            } else if (k >= bias + tmax) {
                t = tmax;
            } else {
                t = k - bias;
            }
            //     if digit < t then break
            if (digit < t) {
                break;
            }
            //     let w = w * (base - t), fail on overflow
            w *= (base - t);
            if (w > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            //   end
        }
        //   let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, output.length + 1, oldi == 0);
        //   let n = n + i div (length(output) + 1), fail on overflow
        ulong next = n + i / (output.length + 1);
        if (next > dchar.max || !isValidDchar(cast(dchar) next)) {
            throw new URLException("punycode input decodes to an invalid code point");
        }
        n = cast(dchar) next;
        //   let i = i mod (length(output) + 1)
        i %= (output.length + 1);
        //   {if n is a basic code point then fail}
        // (We aren't actually going to fail here; it's clear what this means.)
        //   insert n into output at position i
        output.insertInPlace(cast(size_t) i, n);
        //   increment i
        i++;
        // end
    }
    return output.to!string;
}

// Lifted from punycode.js.
// Maps a digit value 0..35 to its basic code point: 0..25 become 'a'..'z', 26..35 become '0'..'9'.
private dchar digitToBasic(ulong digit) {
    return cast(dchar)(digit < 26 ? 'a' + digit : '0' + (digit - 26));
}

// Lifted from punycode.js.
// Maps a basic code point to its digit value: '0'..'9' become 26..35, letters of either case
// become 0..25. Any other character yields `base`, which is not a valid digit.
private uint basicToDigit(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0' + 26;
    }
    if (c >= 'A' && c <= 'Z') {
        return c - 'A';
    }
    if (c >= 'a' && c <= 'z') {
        return c - 'a';
    }
    return base;
}

unittest {
    {
        auto a = "b\u00FCcher";
        assert(punyEncode(a) == "xn--bcher-kva");
    }
    {
        auto a = "b\u00FCc\u00FCher";
        assert(punyEncode(a) == "xn--bcher-kvab");
    }
    {
        auto a = "ýbücher";
        auto b = punyEncode(a);
        assert(b == "xn--bcher-kvaf", b);
    }

    {
        auto a = "mañana";
        assert(punyEncode(a) == "xn--maana-pta");
    }

    {
        auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
        auto b = punyEncode(a);
        assert(b == "xn--egbpdaj6bu4bxfgehfvwxn", b);
    }
}

unittest {
    {
        auto b = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
        assert(b == "ليهمابتكلموشعربي؟", b);
    }
    {
        assert(punyDecode("xn--maana-pta") == "mañana");
    }
}

unittest {
    import std.string, std.algorithm, std.array, std.range;
    {
        auto domain = "xn--m3h.xn--n3h.com";
        auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
        assert(decodedDomain == "☂.☃.com", decodedDomain);
    }
    {
        auto domain = "☂.☃.com";
        auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
        assert(encodedDomain == "xn--m3h.xn--n3h.com", encodedDomain);
    }
}