/**
 * A URL handling library.
 *
 * URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional
 * elements like port, path, username, and password.
 *
 * This module aims to make it simple to muck about with them.
 *
 * Example usage:
 * ---
 * auto url = "ssh://me:password@192.168.0.8/".parseURL;
 * auto files = system("ssh", url.toString, "ls").splitLines;
 * foreach (file; files) {
 *     system("scp", url ~ file, ".");
 * }
 * ---
 *
 * License: The MIT license.
 */
module url;

import std.algorithm;
import std.array;
import std.conv;
import std.encoding;
import std.string;
import std.utf;

@safe:

/// An exception thrown when something bad happens with URLs.
class URLException : Exception {
    this(string msg) { super(msg); }
}

/**
 * A mapping from schemes to their default ports.
 *
 * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to
 * use even if they use ports. Entries here should be treated as best guesses.
 */
ushort[string] schemeToDefaultPort;

static this() {
    schemeToDefaultPort = [
        "aaa": 3868,
        "aaas": 5658,
        "acap": 674,
        "amqp": 5672,
        "cap": 1026,
        "coap": 5683,
        "coaps": 5684,
        "dav": 443,
        "dict": 2628,
        "ftp": 21,
        "git": 9418,
        "go": 1096,
        "gopher": 70,
        "http": 80,
        "https": 443,
        "iac": 4569,
        "icap": 1344,
        "imap": 143,
        "ipp": 631,
        "ipps": 631,  // yes, they're both mapped to port 631
        "irc": 6667,  // De facto default port, not the IANA reserved port.
        "ircs": 6697,
        "iris": 702,  // defaults to iris.beep
        "iris.beep": 702,
        "iris.lwz": 715,
        "iris.xpc": 713,
        "iris.xpcs": 714,
        "jabber": 5222,  // client-to-server
        "ldap": 389,
        "ldaps": 636,
        "msrp": 2855,
        "msrps": 2855,
        "mtqp": 1038,
        "mupdate": 3905,
        "news": 119,
        "nfs": 2049,
        "pop": 110,
        "redis": 6379,
        "reload": 6084,
        "rsync": 873,
        "rtmfp": 1935,
        "rtsp": 554,
        "shttp": 80,
        "sieve": 4190,
        "sip": 5060,
        "sips": 5061,
        "smb": 445,
        "smtp": 25,
        "snews": 563,
        "snmp": 161,
        "soap.beep": 605,
        "ssh": 22,
        "stun": 3478,
        "stuns": 5349,
        "svn": 3690,
        "teamspeak": 9987,
        "telnet": 23,
        "tftp": 69,
        "tip": 3372,
    ];
}

/**
 * A Uniform Resource Locator.
 *
 * URLs can be parsed (see parseURL) and implicitly convert to strings.
 */
struct URL {
    /// The URL scheme. For instance, ssh, ftp, or https.
    string scheme;

    /// The username in this URL. Usually absent. The accompanying password may be empty.
    string user;

    /// The password in this URL. Usually absent.
    string pass;

    /// The hostname.
    string host;

    /**
     * The port.
     *
     * This is inferred from the scheme if it isn't present in the URL itself.
     * If the scheme is not known and the port is not present, the port will be given as 0.
     * For some schemes, port will not be sensible -- for instance, file or chrome-extension.
     *
     * If you explicitly need to detect whether the user provided a port, check the providedPort
     * field.
     */
    @property ushort port() {
        if (providedPort != 0) {
            return providedPort;
        }
        if (auto p = scheme in schemeToDefaultPort) {
            return *p;
        }
        return 0;
    }

    /**
     * Set the port.
     *
     * This sets the providedPort field and is provided for convenience.
     */
    @property ushort port(ushort value) {
        return providedPort = value;
    }

    /// The port that was explicitly provided in the URL.
    ushort providedPort;

    /**
     * The path.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is
     * "/news/story/17774".
     */
    string path;

    /**
     * The query string elements.
     *
     * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the query string
     * elements will be ["visited": "false"].
     *
     * Similarly, in the URL https://bbc.co.uk/news?item, the query string elements will be
     * ["item": ""].
     *
     * This field is mutable, so be cautious.
     */
    string[string] query;

    /**
     * The fragment. In web documents, this typically refers to an anchor element.
     * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2".
     */
    string fragment;

    /**
     * Convert this URL to a string.
     * The string is properly formatted and usable for, eg, a web request.
     *
     * Throws:
     *   URLException if the host contains a character that is illegal in a domain name.
     */
    string toString() {
        return toString(false);
    }

    /**
     * Convert this URL to a string.
     * The string is intended to be human-readable rather than machine-readable.
     */
    string toHumanReadableString() {
        return toString(true);
    }

    /*
     * Shared implementation of toString and toHumanReadableString.
     *
     * When humanReadable is false every component is percent-encoded and the host is
     * Punycode-encoded; when it is true the components are emitted verbatim.
     */
    private string toString(bool humanReadable) {
        Appender!string s;
        s ~= scheme;
        s ~= "://";
        // Test lengths rather than pointer truthiness: an empty-but-non-null string must not
        // produce a dangling ":@", and a password without a user name must not be dropped.
        if (user.length > 0 || pass.length > 0) {
            s ~= humanReadable ? user : user.percentEncode;
            s ~= ":";
            s ~= humanReadable ? pass : pass.percentEncode;
            s ~= "@";
        }
        s ~= humanReadable ? host : host.toPuny;
        if (providedPort) {
            // Only print the port when it differs from the scheme's default.
            auto defaultPort = scheme in schemeToDefaultPort;
            if (defaultPort is null || *defaultPort != providedPort) {
                s ~= ":";
                s ~= providedPort.to!string;
            }
        }
        string p = path;
        if (p.length == 0 || p == "/") {
            s ~= '/';
        } else {
            if (p[0] == '/') {
                p = p[1 .. $];
            }
            if (humanReadable) {
                s ~= '/';
                s ~= p;
            } else {
                // Encode each segment separately so the separators themselves survive.
                foreach (part; p.split('/')) {
                    s ~= '/';
                    s ~= part.percentEncode;
                }
            }
        }
        if (query.length > 0) {
            s ~= '?';
            bool first = true;
            foreach (k, v; query) {
                if (!first) {
                    s ~= '&';
                }
                first = false;
                s ~= k.percentEncode;
                if (v.length > 0) {
                    s ~= '=';
                    s ~= v.percentEncode;
                }
            }
        }
        if (fragment.length > 0) {
            s ~= '#';
            s ~= fragment.percentEncode;
        }
        return s.data;
    }

    /// Implicitly convert URLs to strings.
    alias toString this;

    /**
     * The append operator (~).
     *
     * The append operator for URLs returns a new URL with the given string appended as a path
     * element to the URL's path. It only adds new path elements (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Query elements are copied.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * auto randInt = random ~ "int";
     * writeln(randInt);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opBinary(string op : "~")(string subsequentPath) {
        URL other = this;
        other ~= subsequentPath;
        if (query !is null) {
            // Associative arrays are reference types; copy so the two URLs don't share one.
            other.query = other.query.dup;
        }
        return other;
    }

    /**
     * The append-in-place operator (~=).
     *
     * The append operator for URLs adds a path element to this URL. It only adds new path elements
     * (or sequences of path elements).
     *
     * Don't worry about path separators; whether you include them or not, it will just work.
     *
     * Examples:
     * ---
     * auto random = "http://testdata.org/random".parseURL;
     * random ~= "int";
     * writeln(random);  // prints "http://testdata.org/random/int"
     * ---
     */
    URL opOpAssign(string op : "~")(string subsequentPath) {
        immutable trailingSlash = path.endsWith("/");
        immutable leadingSlash = subsequentPath.startsWith("/");
        if (trailingSlash && leadingSlash) {
            // Both sides bring a separator; keep only one.
            path ~= subsequentPath[1 .. $];
        } else if (trailingSlash || leadingSlash) {
            path ~= subsequentPath;
        } else {
            path ~= '/';
            path ~= subsequentPath;
        }
        return this;
    }
}

/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * IPv6 address literals ("[::1]") are not supported and are reported as a parse failure.
 *
 * Params:
 *   value = the text to parse. If it contains no "://", the scheme "http" is inferred.
 *   url = receives the parsed URL. Its contents are unspecified when false is returned.
 *
 * Returns:
 *   true if the string was parsed, false if it was malformed (bad percent-encoding, bad
 *   Punycode, or a port that is not a number in the range of ushort).
 */
bool tryParseURL(string value, out URL url) {
    url = URL.init;
    // scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]
    // Scheme is optional in common use. We infer 'http' if it's not given.
    auto schemeEnd = value.indexOf("://");
    if (schemeEnd > -1) {
        url.scheme = value[0 .. schemeEnd];
        value = value[schemeEnd + 3 .. $];
    } else {
        url.scheme = "http";
    }

    // The authority ([user:password@]host[:port]) ends at the first '/', '?' or '#'.
    string authority = value;
    string rest = null;
    auto authorityEnd = value.indexOfAny("/?#");
    if (authorityEnd > -1) {
        authority = value[0 .. authorityEnd];
        rest = value[authorityEnd .. $];
    }

    try {
        // Split off the user info first so a ':' in "host:port" is never mistaken for the
        // user/password separator (and vice versa).
        auto at = authority.indexOf('@');
        if (at > -1) {
            auto userInfo = authority[0 .. at];
            authority = authority[at + 1 .. $];
            auto colon = userInfo.indexOf(':');
            if (colon > -1) {
                url.user = userInfo[0 .. colon].percentDecode;
                url.pass = userInfo[colon + 1 .. $].percentDecode;
            } else {
                url.user = userInfo.percentDecode;
            }
        }

        auto portSeparator = authority.indexOf(':');
        if (portSeparator > -1) {
            url.host = authority[0 .. portSeparator].fromPuny;
            url.providedPort = authority[portSeparator + 1 .. $].to!ushort;
        } else {
            url.host = authority.fromPuny;
        }

        // [/]path[?query][#fragment]
        auto pathEnd = rest.indexOfAny("?#");
        if (pathEnd == -1) {
            url.path = rest.percentDecode;
            return true;
        }
        url.path = rest[0 .. pathEnd].percentDecode;
        immutable separator = rest[pathEnd];
        rest = rest[pathEnd + 1 .. $];

        if (separator == '?') {
            string queryString;
            auto queryEnd = rest.indexOf('#');
            if (queryEnd == -1) {
                queryString = rest;
                rest = null;
            } else {
                queryString = rest[0 .. queryEnd];
                rest = rest[queryEnd + 1 .. $];
            }
            foreach (pair; queryString.split('&')) {
                auto eq = pair.indexOf('=');
                if (eq == -1) {
                    url.query[pair.percentDecode] = "";
                } else {
                    url.query[pair[0 .. eq].percentDecode] = pair[eq + 1 .. $].percentDecode;
                }
            }
        }

        // Whatever is left followed a '#'.
        url.fragment = rest.percentDecode;
        return true;
    } catch (URLException) {
        // Malformed percent-encoding or Punycode.
        return false;
    } catch (ConvException) {
        // The port was empty, not numeric, or out of range.
        return false;
    }
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            query["hello"] = "world";
            query["gibe"] = "clay";
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            query["❄"] = "❀";
            query["["] = "]";
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&%5B=%5D#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?%5B=%5D&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            query["hi"] = "bye";
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest {
    // Percent decoding.

    // http://#:!:@
    auto urlString = "http://%23:%21%3A@example.org/%7B/%7D?%3B&%26=%3D#%23hash";
    auto url = urlString.parseURL;
    assert(url.user == "#");
    assert(url.pass == "!:");
    assert(url.host == "example.org");
    assert(url.path == "/{/}");
    assert(url.query[";"] == "");
    assert(url.query["&"] == "=");
    assert(url.fragment == "#hash");

    // Round trip.
    assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
    assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.host == "☂.☃.org", url.host);
}

unittest {
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString);
    assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString);
}

unittest {
    auto url = "https://☂.☃.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
    // There's an existing path.
    auto url = parseURL("http://example.org/foo");
    // No slash? Assume it needs a slash.
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // With slash? Don't add another.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/foo/bar");

    // Path already ends with a slash; don't add another.
    url = parseURL("http://example.org/foo/");
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // Still don't add one even if you're appending with a slash.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "/bar";
    assert(url.toString == "http://example.org/foo/bar");

    // No path.
    url = parseURL("http://example.org");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar");

    // Path is just a slash.
    url = parseURL("http://example.org/");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar", url.toString);

    // No path, just fragment.
    url = "ircs://irc.freenode.com/#d".parseURL;
    assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest {
    import std.net.curl;
    auto url = "http://example.org".parseURL;
    assert(is(typeof(std.net.curl.get(url))));
}

unittest {
    // Malformed input is reported through the return value, never by throwing or crashing.
    URL url;
    assert(!tryParseURL("http://example.org/%zz", url));
    assert(!tryParseURL("http://example.org/%", url));
    assert(!tryParseURL("http://example.org:notaport/", url));
    assert(!tryParseURL("http://xn--z/", url));

    // A user name without a password.
    assert(tryParseURL("http://bob@example.org:8080/", url));
    assert(url.user == "bob");
    assert(url.pass == "");
    assert(url.host == "example.org");
    assert(url.port == 8080);

    // Query or fragment directly after the host.
    assert(tryParseURL("http://example.org?x=1", url));
    assert(url.host == "example.org", url.host);
    assert(url.path == "");
    assert(url.query["x"] == "1");

    assert(tryParseURL("http://example.org#top", url));
    assert(url.host == "example.org", url.host);
    assert(url.fragment == "top");
}

/**
 * Parse the input string as a URL.
 *
 * Throws:
 *   URLException if the string was in an incorrect format.
 */
URL parseURL(string value) {
    URL url;
    if (tryParseURL(value, url)) {
        return url;
    }
    throw new URLException("failed to parse URL " ~ value);
}

///
unittest {
    {
        // Infer scheme
        auto u1 = parseURL("example.org");
        assert(u1.scheme == "http");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 80);
        assert(u1.providedPort == 0);
        assert(u1.fragment == "");
    }
    {
        // Simple host and scheme
        auto u1 = parseURL("https://example.org");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "");
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With path
        auto u1 = parseURL("https://example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 443);
        assert(u1.providedPort == 0);
    }
    {
        // With explicit port
        auto u1 = parseURL("https://example.org:1021/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar", "expected /foo/bar but got " ~ u1.path);
        assert(u1.port == 1021);
        assert(u1.providedPort == 1021);
    }
    {
        // With user
        auto u1 = parseURL("https://bob:secret@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
    }
    {
        // With user, URL-encoded
        auto u1 = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 443);
        assert(u1.user == "bob!");
        assert(u1.pass == "secret!?");
    }
    {
        // With user and port and path
        auto u1 = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/foo/bar");
        assert(u1.port == 2210);
        assert(u1.user == "bob");
        assert(u1.pass == "secret");
        assert(u1.fragment == "");
    }
    {
        // With query string
        auto u1 = parseURL("https://example.org/?login=true");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.query["login"] == "true");
        assert(u1.fragment == "");
    }
    {
        // With query string and fragment
        auto u1 = parseURL("https://example.org/?login=true#justkidding");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/", "expected path: / actual path: " ~ u1.path);
        assert(u1.query["login"] == "true");
        assert(u1.fragment == "justkidding");
    }
    {
        // With URL-encoded values
        auto u1 = parseURL("https://example.org/%E2%98%83?%E2%9D%84=%3D#%5E");
        assert(u1.scheme == "https");
        assert(u1.host == "example.org");
        assert(u1.path == "/☃", "expected path: /☃ actual path: " ~ u1.path);
        assert(u1.query["❄"] == "=");
        assert(u1.fragment == "^");
    }
}

unittest {
    assert(parseURL("http://example.org").port == 80);
    assert(parseURL("http://example.org:5326").port == 5326);

    auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
    assert(url.scheme == "redis");
    assert(url.user == "admin");
    assert(url.pass == "password");

    assert(parseURL("example.org").toString == "http://example.org/");
    assert(parseURL("http://example.org:80").toString == "http://example.org/");

    assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}

/**
 * Percent-encode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * Params:
 *   raw = the text to encode.
 *
 * Returns:
 *   the input with every character other than ASCII letters, digits and "-._~" replaced by the
 *   percent-encoded octets of its UTF-8 representation (uppercase hex).
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    static immutable hexDigits = "0123456789ABCDEF";
    Appender!string app;
    foreach (dchar d; raw) {
        if (('a' <= d && 'z' >= d) ||
                ('A' <= d && 'Z' >= d) ||
                ('0' <= d && '9' >= d) ||
                d == '-' || d == '.' || d == '_' || d == '~') {
            app ~= d;
            continue;
        }
        // Everything else is emitted as the percent-encoded octets of its UTF-8 encoding
        // (one octet for 7-bit ASCII, up to four otherwise).
        char[4] buf;
        immutable len = std.utf.encode(buf, d);
        foreach (char b; buf[0 .. len]) {
            app ~= '%';
            app ~= hexDigits[b >> 4];
            app ~= hexDigits[b & 0x0F];
        }
    }
    return app.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This explicitly ensures that the result is a valid UTF-8 string.
 *
 * Throws:
 *   URLException if the input is not correctly percent-encoded or does not decode to valid UTF-8.
 */
@trusted string percentDecode(string encoded) {
    ubyte[] raw = percentDecodeRaw(encoded);
    // The buffer is freshly allocated and not referenced anywhere else, so viewing it as an
    // immutable string is sound.
    auto s = cast(string) raw;
    if (!s.isValid) {
        throw new URLException("input contains invalid UTF data");
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    // Hex digits are case-insensitive.
    e = percentDecode("%e2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException.
 *
 * Throws:
 *   URLException if a '%' is not followed by exactly two hexadecimal digits.
 */
ubyte[] percentDecodeRaw(string encoded) {
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    Appender!(ubyte[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= cast(ubyte) encoded[i];
            continue;
        }
        // Written as a subtraction from the length (i < length here) so that it cannot
        // underflow for inputs shorter than two characters.
        if (encoded.length - i < 3) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        immutable high = hexValue(encoded[i + 1]);
        immutable low = hexValue(encoded[i + 2]);
        if (high < 0 || low < 0) {
            throw new URLException("Invalid percent encoded value: expected two hexadecimal " ~
                    "digits after percent symbol. Error at index " ~ i.to!string);
        }
        app ~= cast(ubyte)((high << 4) | low);
        i += 2;
    }
    return app.data;
}

unittest {
    assert(percentDecodeRaw("a%20b") == cast(ubyte[]) [0x61, 0x20, 0x62]);
    assert(percentDecodeRaw("%fF") == cast(ubyte[]) [0xFF]);

    foreach (bad; ["%", "%4", "abc%", "%zz", "%4g"]) {
        bool threw = false;
        try {
            percentDecodeRaw(bad);
        } catch (URLException) {
            threw = true;
        }
        assert(threw, bad);
    }
}

// The value of a single hexadecimal digit (either case), or -1 if c is not one.
private int hexValue(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}

// Convert a (possibly Unicode) host name to its ASCII form, Punycode-encoding each label that
// needs it. Throws URLException if the name contains an ASCII character that is not allowed.
private string toPuny(string unicodeHostname) {
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c >= 0x80) {
            // Keep scanning: later ASCII characters still have to be validated.
            mustEncode = true;
            continue;
        }
        // Allowed ASCII: ',' through '9', 'A'-'Z' and 'a'-'z'.
        if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    return unicodeHostname.split('.').map!punyEncode.join(".");
}

// Convert an ASCII host name to its Unicode form by Punycode-decoding each label.
private string fromPuny(string hostname) {
    return hostname.split('.').map!punyDecode.join(".");
}

unittest {
    assert(toPuny("example.org") == "example.org");
    assert(toPuny("☂.☃.org") == "xn--m3h.xn--n3h.org");

    foreach (bad; ["exa:mple.org", "user@example.org", "exam ple.org", "☃@example.org"]) {
        bool threw = false;
        try {
            toPuny(bad);
        } catch (URLException) {
            threw = true;
        }
        assert(threw, bad);
    }
}

// Punycode parameters; the names follow RFC 3492.
private {
    enum delimiter = '-';
    enum marker = "xn--";
    enum ulong damp = 700;
    enum ulong tmin = 1;
    enum ulong tmax = 26;
    enum ulong skew = 38;
    enum ulong base = 36;
    enum ulong initialBias = 72;
    enum dchar initialN = cast(dchar)128;

    // The bias adaptation function from RFC 3492.
    ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
        if (firstTime) {
            delta /= damp;
        } else {
            delta /= 2;
        }
        delta += delta / numPoints;
        ulong k = 0;
        while (delta > ((base - tmin) * tmax) / 2) {
            delta /= (base - tmin);
            k += base;
        }
        return k + (((base - tmin + 1) * delta) / (delta + skew));
    }

    // The threshold t for digit position k:
    // tmin if k <= bias, tmax if k >= bias + tmax, or k - bias otherwise.
    ulong threshold(ulong k, ulong bias) {
        if (k <= bias) {
            return tmin;
        }
        if (k >= bias + tmax) {
            return tmax;
        }
        return k - bias;
    }
}

/**
 * Encode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-encode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "☂.☃.com";
 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
 * ---
 *
 * Returns:
 *   the encoded segment, or the input itself if it contains only ASCII characters.
 *
 * Throws:
 *   URLException if the segment cannot be encoded without overflow.
 */
string punyEncode(string input) {
    Appender!string output;
    output ~= marker;

    // Copy the basic (ASCII) code points through unchanged, counting as we go.
    ulong pushed = 0;
    ulong codePoints = 0;
    foreach (dchar c; input) {
        codePoints++;
        if (c < initialN) {
            output ~= c;
            pushed++;
        }
    }
    if (pushed == codePoints) {
        // No encoding to do.
        return input;
    }
    if (pushed > 0) {
        output ~= delimiter;
    }

    ulong delta = 0;
    ulong n = initialN;
    ulong bias = initialBias;
    bool first = true;
    while (pushed < codePoints) {
        // Find the smallest code point not yet handled.
        ulong best = ulong.max;
        foreach (dchar c; input) {
            if (n <= c && c < best) {
                best = c;
            }
        }
        if (best == ulong.max) {
            throw new URLException("failed to find a new codepoint to process during punyencode");
        }
        delta += (best - n) * (pushed + 1);
        if (delta > uint.max) {
            throw new URLException("overflow during punyencode");
        }
        n = best;
        foreach (dchar c; input) {
            if (c < n) {
                delta++;
            }
            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                ulong q = delta;
                for (ulong k = base; ; k += base) {
                    immutable t = threshold(k, bias);
                    if (q < t) {
                        break;
                    }
                    output ~= digitToBasic(t + ((q - t) % (base - t)));
                    q = (q - t) / (base - t);
                }
                output ~= digitToBasic(q);
                pushed++;
                bias = adapt(delta, pushed, first);
                first = false;
                delta = 0;
            }
        }
        delta++;
        n++;
    }
    return output.data;
}

/**
 * Decode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-decode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "xn--m3h.xn--n3h.com";
 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
 * ---
 *
 * Returns:
 *   the decoded segment, or the input itself if it does not start with "xn--".
 *
 * Throws:
 *   URLException if the segment starts with "xn--" but is not valid Punycode.
 */
string punyDecode(string input) {
    if (!input.startsWith(marker)) {
        return input;
    }
    input = input[marker.length .. $];

    dchar[] output;
    // This reserves a bit more than necessary, but it should be more efficient overall than just
    // appending and inserting willy-nilly.
    output.reserve(input.length);

    // Everything before the last delimiter (if there is one) is a literal basic code point.
    auto end = input.lastIndexOf(delimiter);
    if (end > -1) {
        foreach (dchar c; input[0 .. end]) {
            if (c >= initialN) {
                throw new URLException("non-basic code point in the literal part of a punycode segment");
            }
            output ~= c;
        }
        input = input[end + 1 .. $];
    }

    ulong n = initialN;
    ulong i = 0;
    ulong bias = initialBias;
    size_t pos = 0;
    while (pos < input.length) {
        // Decode one generalized variable-length integer into i.
        immutable oldi = i;
        ulong w = 1;
        for (ulong k = base; ; k += base) {
            if (pos >= input.length) {
                throw new URLException("punycode segment ends in the middle of a code point");
            }
            // Note that the input is all ASCII, so we can simply index the input string bytewise.
            immutable digit = basicToDigit(input[pos]);
            pos++;
            if (digit >= base) {
                throw new URLException("invalid digit in punycode segment");
            }
            i += digit * w;
            if (i > uint.max) {
                throw new URLException("overflow during punydecode");
            }
            immutable t = threshold(k, bias);
            if (digit < t) {
                break;
            }
            w *= (base - t);
            if (w > uint.max) {
                throw new URLException("overflow during punydecode");
            }
        }
        immutable count = output.length + 1;
        bias = adapt(i - oldi, count, oldi == 0);
        n += i / count;
        if (n > dchar.max || !isValidDchar(cast(dchar) n)) {
            throw new URLException("punycode segment decodes to an invalid code point");
        }
        i %= count;
        // insertInPlace should be @safe but isn't marked.
        (() @trusted { output.insertInPlace(cast(size_t) i, cast(dchar) n); })();
        i++;
    }
    return output.to!string;
}

// Lifted from punycode.js.
// Digits 0-25 map to 'a'-'z', digits 26-35 map to '0'-'9'.
private dchar digitToBasic(ulong digit) {
    return cast(dchar)(digit + 22 + 75 * (digit < 26));
}

// Lifted from punycode.js.
// The inverse of digitToBasic (letters are case-insensitive). Returns base for a character that
// is not a Punycode digit.
private uint basicToDigit(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0' + 26;
    }
    if (c >= 'A' && c <= 'Z') {
        return c - 'A';
    }
    if (c >= 'a' && c <= 'z') {
        return c - 'a';
    }
    return base;
}

unittest {
    {
        auto a = "b\u00FCcher";
        assert(punyEncode(a) == "xn--bcher-kva");
    }
    {
        auto a = "b\u00FCc\u00FCher";
        assert(punyEncode(a) == "xn--bcher-kvab");
    }
    {
        auto a = "ýbücher";
        auto b = punyEncode(a);
        assert(b == "xn--bcher-kvaf", b);
    }

    {
        auto a = "mañana";
        assert(punyEncode(a) == "xn--maana-pta");
    }

    {
        auto a = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
        auto b = punyEncode(a);
        assert(b == "xn--egbpdaj6bu4bxfgehfvwxn", b);
    }
}

unittest {
    {
        auto b = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
        assert(b == "ليهمابتكلموشعربي؟", b);
    }
    {
        assert(punyDecode("xn--maana-pta") == "mañana");
    }
    {
        // Malformed segments throw instead of reading past the end of the input.
        foreach (bad; ["xn--z", "xn--maana-pt", "xn--a!b"]) {
            bool threw = false;
            try {
                punyDecode(bad);
            } catch (URLException) {
                threw = true;
            }
            assert(threw, bad);
        }
    }
}

unittest {
    import std.string, std.algorithm, std.array, std.range;
    {
        auto domain = "xn--m3h.xn--n3h.com";
        auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
        assert(decodedDomain == "☂.☃.com", decodedDomain);
    }
    {
        auto domain = "☂.☃.com";
        auto decodedDomain = domain.splitter(".").map!(punyEncode).join(".");
        assert(decodedDomain == "xn--m3h.xn--n3h.com", decodedDomain);
    }
}