Various UrlParser improvements

See gh-32513
This commit is contained in:
Arjen Poutsma 2024-04-25 14:11:57 +02:00
parent 27d2200058
commit 3cfa4ed4f7
2 changed files with 515 additions and 107 deletions

View File

@ -657,12 +657,6 @@ final class HierarchicalUriComponents extends UriComponents {
public boolean isAllowed(int c) {
return isUnreserved(c);
}
},
C0 {
@Override
public boolean isAllowed(int c) {
return !(c >= 0 && c <= 0x1f) && !(c > '~');
}
};
/**

View File

@ -23,9 +23,10 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.StringTokenizer;
import java.util.function.Consumer;
import java.util.function.IntPredicate;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -234,23 +235,70 @@ final class UrlParser {
this.state = newState;
}
private static List<String> tokenize(String str, String delimiters) {
StringTokenizer st = new StringTokenizer(str, delimiters);
private static List<String> strictSplit(String input, int delimiter) {
// Let position be a position variable for input, initially pointing at the start of input.
int position = 0;
// Let tokens be a list of strings, initially empty.
List<String> tokens = new ArrayList<>();
while (st.hasMoreTokens()) {
tokens.add(st.nextToken());
// Let token be the result of collecting a sequence of code points that are not equal to delimiter from input, given position.
int delIdx = input.indexOf(delimiter, position);
String token = (delIdx != EOF) ? input.substring(position, delIdx) : input.substring(position);
position = delIdx;
// Append token to tokens.
tokens.add(token);
// While position is not past the end of input:
while (position != EOF) {
// Assert: the code point at position within input is delimiter.
Assert.state(input.codePointAt(position) == delimiter, "Codepoint is not a delimiter");
// Advance position by 1.
position++;
delIdx = input.indexOf(delimiter, position);
// Let token be the result of collecting a sequence of code points that are not equal to delimiter from input, given position.
token = (delIdx != EOF) ? input.substring(position, delIdx) : input.substring(position);
position = delIdx;
// Append token to tokens.
tokens.add(token);
}
return tokens;
}
/**
 * Convert the given domain to its ASCII form.
 * @param domain the domain to convert
 * @param beStrict whether {@link IDN#USE_STD3_ASCII_RULES} is applied
 * @return the lower-cased domain when the ASCII shortcut applies, otherwise the result of
 * {@link IDN#toASCII(String, int)}
 * @throws InvalidUrlException if {@code IDN.toASCII} rejects the domain
 */
private static String domainToAscii(String domain, boolean beStrict) {
    // If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.) does not
    // produce any item that starts with an ASCII case-insensitive match for "xn--", this step is equivalent
    // to ASCII lowercasing domain.
    if (!beStrict && containsOnlyAscii(domain) && !hasPunycodeLabel(domain)) {
        return domain.toLowerCase(Locale.ENGLISH);
    }
    // Let result be the result of running Unicode ToASCII (https://www.unicode.org/reports/tr46/#ToASCII)
    // with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false,
    // CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and
    // VerifyDnsLength set to beStrict. [UTS46]
    int flag = 0;
    if (beStrict) {
        flag |= IDN.USE_STD3_ASCII_RULES;
    }
    // Implementation note: implementing Unicode ToASCII is beyond the scope of this parser, we use java.net.IDN.toASCII
    try {
        return IDN.toASCII(domain, flag);
    }
    catch (IllegalArgumentException ex) {
        throw new InvalidUrlException("Could not convert \"" + domain + "\" to ASCII: " + ex.getMessage(), ex);
    }
}

/**
 * Indicate whether any dot-separated label of the given domain, including the first one, starts with
 * an ASCII case-insensitive match for "xn--". Only called for domains known to be ASCII-only.
 */
private static boolean hasPunycodeLabel(String domain) {
    int labelStart = 0;
    while (true) {
        // regionMatches returns false, rather than throwing, when fewer than four chars remain.
        if (domain.regionMatches(true, labelStart, "xn--", 0, 4)) {
            return true;
        }
        int dotIdx = domain.indexOf('.', labelStart);
        if (dotIdx == -1) {
            return false;
        }
        labelStart = dotIdx + 1;
    }
}
private boolean validate() {
@ -284,6 +332,50 @@ final class UrlParser {
throw new InvalidUrlException(message.toString());
}
/**
* The C0 control percent-encode set are the C0 controls and all code points greater than U+007E (~).
*/
private static boolean c0ControlPercentEncodeSet(int ch) {
    // Only U+0020 through U+007E fall outside the set. Negative values (for instance a
    // sign-extended byte) are treated as "greater than ~", as with an unsigned comparison.
    boolean printableAscii = (ch >= 0x20 && ch <= '~');
    return !printableAscii;
}
/**
* The fragment percent-encode set is the C0 control percent-encode set and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
*/
private static boolean fragmentPercentEncodeSet(int ch) {
    return switch (ch) {
        // The code points this set adds on top of the C0 control percent-encode set.
        case ' ', '"', '<', '>', '`' -> true;
        default -> c0ControlPercentEncodeSet(ch);
    };
}
/**
* The query percent-encode set is the C0 control percent-encode set and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
*/
private static boolean queryPercentEncodeSet(int ch) {
    return switch (ch) {
        // The code points this set adds on top of the C0 control percent-encode set.
        case ' ', '"', '#', '<', '>' -> true;
        default -> c0ControlPercentEncodeSet(ch);
    };
}
/**
* The special-query percent-encode set is the query percent-encode set and U+0027 (').
*/
private static boolean specialQueryPercentEncodeSet(int ch) {
    // The apostrophe is the only addition to the query percent-encode set.
    if (ch == '\'') {
        return true;
    }
    return queryPercentEncodeSet(ch);
}
/**
* The path percent-encode set is the query percent-encode set and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
*/
private static boolean pathPercentEncodeSet(int ch) {
    return switch (ch) {
        // The code points this set adds on top of the query percent-encode set.
        case '?', '`', '{', '}' -> true;
        default -> queryPercentEncodeSet(ch);
    };
}
/**
* The userinfo percent-encode set is the path percent-encode set and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@), U+005B ([) to U+005E (^), inclusive, and U+007C (|).
*/
private static boolean userinfoPercentEncodeSet(int ch) {
    if (pathPercentEncodeSet(ch)) {
        return true;
    }
    return switch (ch) {
        // '[', '\', ']' and '^' spell out the inclusive range U+005B to U+005E.
        case '/', ':', ';', '=', '@', '[', '\\', ']', '^', '|' -> true;
        default -> false;
    };
}
/**
 * Indicate whether the given value lies in the range U+0000 to U+001F, inclusive.
 */
private static boolean isC0Control(int ch) {
    // The unsigned comparison rejects negative values, which compare as very large numbers.
    return Integer.compareUnsigned(ch, 0x1F) <= 0;
}
@ -307,6 +399,21 @@ final class UrlParser {
return true;
}
/**
 * Indicate whether every char of the given sequence is in the range U+0000 to U+007F.
 * An empty sequence yields {@code true}.
 */
private static boolean containsOnlyAscii(CharSequence string) {
    // chars() yields each char zero-extended to an int, so only the upper bound needs checking.
    return string.chars().allMatch(ch -> ch <= 0x7F);
}
private static boolean isAsciiCodePoint(int ch) {
    // An ASCII code point is a code point in the range U+0000 NULL to U+007F DELETE, inclusive.
    return ch >= 0 && ch <= 0x7F;
}
/**
 * Indicate whether the given value is in the range U+0030 (0) to U+0039 (9), inclusive.
 */
private static boolean isAsciiDigit(int ch) {
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}
@ -400,76 +507,151 @@ final class UrlParser {
}
}
private String percentEncode(int c, HierarchicalUriComponents.Type type) {
return percentEncode(Character.toString(c), type);
private static String percentDecode(String input) {
try {
return UriUtils.decode(input, StandardCharsets.UTF_8);
}
catch (IllegalArgumentException ex) {
throw new InvalidUrlException("Could not decode \"" + input + "\": " + ex.getMessage(), ex);
}
}
private String percentEncode(String source, HierarchicalUriComponents.Type type) {
if (this.encoding != null) {
return HierarchicalUriComponents.encodeUriComponent(source, this.encoding, type);
private String percentEncode(int c, IntPredicate percentEncodeSet) {
return percentEncode(Character.toString(c), percentEncodeSet);
}
private String percentEncode(String input, IntPredicate percentEncodeSet) {
if (this.encoding == null) {
return input;
}
else {
return source;
byte[] bytes = input.getBytes(this.encoding);
boolean original = true;
for (byte b : bytes) {
if (percentEncodeSet.test(b)) {
original = false;
break;
}
}
if (original) {
return input;
}
StringBuilder output = new StringBuilder();
for (byte b : bytes) {
if (!percentEncodeSet.test(b)) {
output.append((char)b);
}
else {
output.append('%');
char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16));
output.append(hex1);
output.append(hex2);
}
}
return output.toString();
}
}
/**
* A single-dot URL path segment is a URL path segment that is "." or an ASCII case-insensitive match for "%2e".
* A single-dot URL path segment is a URL path segment that is "[/]." or an ASCII case-insensitive match for "[/]%2e".
*/
private static boolean isSingleDotPathSegment(StringBuilder b) {
int len = b.length();
if (len == 1) {
char ch0 = b.charAt(0);
return ch0 == '.';
}
else if (len == 3) {
// ASCII case-insensitive match for "%2e".
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E');
}
else {
return false;
switch (len) {
case 1 -> {
char ch0 = b.charAt(0);
return ch0 == '.';
}
case 2 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
return ch0 == '/' && ch1 == '.';
}
case 3 -> {
// ASCII case-insensitive match for "%2e".
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E');
}
case 4 -> {
// ASCII case-insensitive match for "/%2e".
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E');
}
default -> {
return false;
}
}
}
/**
* A double-dot URL path segment is a URL path segment that is "/.." or an ASCII case-insensitive match for "/.%2e", "/%2e.", or "/%2e%2e".
* A double-dot URL path segment is a URL path segment that is "[/].." or an ASCII case-insensitive match for "/.%2e", "/%2e.", or "/%2e%2e".
*/
private static boolean isDoubleDotPathSegment(StringBuilder b) {
int len = b.length();
if (len == 3) {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
return ch0 == '/' && ch1 == '.' && ch2 == '.';
}
else if (len == 5) {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
char ch4 = b.charAt(4);
// case-insensitive match for "/.%2e" or "/%2e."
return ch0 == '/' &&
(ch1 == '.' && ch2 == '%' && ch3 == '2' && (ch4 == 'e' || ch4 == 'E')
|| (ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') && ch4 == '.'));
}
else if (len == 7) {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
char ch4 = b.charAt(4);
char ch5 = b.charAt(5);
char ch6 = b.charAt(6);
// case-insensitive match for "/%2e%2e".
return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E')
&& ch4 == '%' && ch5 == '2' && (ch6 == 'e' || ch6 == 'E');
}
else {
return false;
switch (len) {
case 2 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
return ch0 == '.' && ch1 == '.';
}
case 3 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
return ch0 == '/' && ch1 == '.' && ch2 == '.';
}
case 4 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
// case-insensitive match for ".%2e" or "%2e."
return (ch0 == '.' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') ||
(ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E') && ch3 == '.'));
}
case 5 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
char ch4 = b.charAt(4);
// case-insensitive match for "/.%2e" or "/%2e."
return ch0 == '/' &&
(ch1 == '.' && ch2 == '%' && ch3 == '2' && (ch4 == 'e' || ch4 == 'E')
|| (ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') && ch4 == '.'));
}
case 6 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
char ch4 = b.charAt(4);
char ch5 = b.charAt(5);
// case-insensitive match for "%2e%2e".
return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E')
&& ch3 == '%' && ch4 == '2' && (ch5 == 'e' || ch5 == 'E');
}
case 7 -> {
char ch0 = b.charAt(0);
char ch1 = b.charAt(1);
char ch2 = b.charAt(2);
char ch3 = b.charAt(3);
char ch4 = b.charAt(4);
char ch5 = b.charAt(5);
char ch6 = b.charAt(6);
// case-insensitive match for "/%2e%2e".
return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E')
&& ch4 == '%' && ch5 == '2' && (ch6 == 'e' || ch6 == 'E');
}
default -> {
return false;
}
}
}
@ -656,8 +838,8 @@ final class UrlParser {
// If base is null, or base has an opaque path and c is not U+0023 (#), missing-scheme-non-relative-URL
// validation error, return failure.
if (p.base == null || p.base.path().isOpaque() && c != '#') {
p.failure("The input is missing a scheme, because it does not begin with an ASCII alpha \"" + Character.toString(c) +
"\", and no base URL was provided.");
p.failure("The input is missing a scheme, because it does not begin with an ASCII alpha \"" +
(c != EOF ? Character.toString(c) : "") + "\", and no base URL was provided.");
}
// Otherwise, if base has an opaque path and c is U+0023 (#), set urls scheme to bases scheme, urls
// path to bases path, urls query to bases query, urls fragment to the empty string, and set state to fragment state.
@ -738,8 +920,8 @@ final class UrlParser {
else {
// Set urls username to bases username, urls password to bases password, urls host to bases host,
// urls port to bases port, urls path to a clone of bases path, and urls query to bases query.
url.username = p.base.username();
url.password = p.base.password();
url.username.replace(0, url.username.length(), p.base.username());
url.password.replace(0, url.password.length(), p.base.password());
url.host = p.base.host();
url.port = p.base.port();
url.path = p.base.path().clone();
@ -789,8 +971,8 @@ final class UrlParser {
// to bases host, urls port to bases port, state to path state, and then, decrease pointer by 1.
else {
Assert.state(p.base != null, "No base URL available");
url.username = p.base.username();
url.password = p.base.password();
url.username.replace(0, url.username.length(), p.base.username());
url.password.replace(0, url.password.length(), p.base.password());
url.host = p.base.host();
url.port = p.base.port();
p.setState(PATH);
@ -850,9 +1032,6 @@ final class UrlParser {
p.atSignSeen = true;
int bufferLen = p.buffer.length();
StringBuilder username = new StringBuilder(bufferLen);
StringBuilder password = new StringBuilder(bufferLen);
// For each codePoint in buffer:
for (int i = 0; i < bufferLen; i++) {
int codePoint = p.buffer.codePointAt(i);
@ -862,18 +1041,16 @@ final class UrlParser {
continue;
}
// Let encodedCodePoints be the result of running UTF-8 percent-encode codePoint using the userinfo percent-encode set.
String encodedCodePoints = p.percentEncode(codePoint,HierarchicalUriComponents.Type.USER_INFO);
String encodedCodePoints = p.percentEncode(codePoint, UrlParser::userinfoPercentEncodeSet);
// If passwordTokenSeen is true, then append encodedCodePoints to urls password.
if (p.passwordTokenSeen) {
password.append(encodedCodePoints);
url.password.append(encodedCodePoints);
}
// Otherwise, append encodedCodePoints to urls username.
else {
username.append(encodedCodePoints);
url.username.append(encodedCodePoints);
}
}
url.username = username.toString();
url.password = password.toString();
// Set buffer to the empty string.
p.emptyBuffer();
}
@ -917,7 +1094,7 @@ final class UrlParser {
return;
}
// Let host be the result of host parsing buffer with url is not special.
Host host = Host.parse(p.buffer.toString(), false, p.validationErrorHandler);
Host host = Host.parse(p.buffer.toString(), !url.isSpecial(), p);
// Set urls host to host, buffer to the empty string, and state to port state.
url.host = host;
p.emptyBuffer();
@ -943,7 +1120,7 @@ final class UrlParser {
// EXTRA: if buffer is not empty
if (!p.buffer.isEmpty()) {
// Let host be the result of host parsing buffer with url is not special.
Host host = Host.parse(p.buffer.toString(), false, p.validationErrorHandler);
Host host = Host.parse(p.buffer.toString(), !url.isSpecial(), p);
// Set urls host to host, buffer to the empty string, and state to path start state.
url.host = host;
}
@ -1005,7 +1182,7 @@ final class UrlParser {
}
int defaultPort = defaultPort(url.scheme);
// Set urls port to null, if port is urls schemes default port; otherwise to port.
if (defaultPort == -1 || port == defaultPort) {
if (defaultPort != -1 && port == defaultPort) {
url.port = null;
}
else {
@ -1160,7 +1337,7 @@ final class UrlParser {
// Otherwise, run these steps:
else {
// Let host be the result of host parsing buffer with url is not special.
Host host = Host.parse(p.buffer.toString(), false, p.validationErrorHandler);
Host host = Host.parse(p.buffer.toString(), !url.isSpecial(), p);
// If host is "localhost", then set host to the empty string.
if (host instanceof Domain domain && domain.domain().equals("localhost")) {
host = EmptyHost.INSTANCE;
@ -1308,7 +1485,7 @@ final class UrlParser {
}
}
// UTF-8 percent-encode c using the path percent-encode set and append the result to buffer.
String encoded = p.percentEncode(c, HierarchicalUriComponents.Type.PATH_SEGMENT);
String encoded = p.percentEncode(c, UrlParser::pathPercentEncodeSet);
p.buffer.append(encoded);
}
}
@ -1353,7 +1530,7 @@ final class UrlParser {
}
// If c is not the EOF code point, UTF-8 percent-encode c using the C0 control percent-encode set and append the result to urls path.
if (c != EOF) {
String encoded = p.percentEncode(c, HierarchicalUriComponents.Type.C0);
String encoded = p.percentEncode(c, UrlParser::c0ControlPercentEncodeSet);
url.path.append(encoded);
}
}
@ -1376,8 +1553,9 @@ final class UrlParser {
// - c is the EOF code point
if ( (p.stateOverride == null && c == '#') || c == EOF) {
// Let queryPercentEncodeSet be the special-query percent-encode set if url is special; otherwise the query percent-encode set.
IntPredicate queryPercentEncodeSet = url.isSpecial() ? UrlParser::specialQueryPercentEncodeSet : UrlParser::queryPercentEncodeSet;
// Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to urls query.
String encoded = p.percentEncode(p.buffer.toString(), HierarchicalUriComponents.Type.QUERY);
String encoded = p.percentEncode(p.buffer.toString(), queryPercentEncodeSet);
Assert.state(url.query != null, "Url's query should not be null");
url.query += encoded;
// Set buffer to the empty string.
@ -1432,7 +1610,7 @@ final class UrlParser {
}
}
// UTF-8 percent-encode c using the fragment percent-encode set and append the result to urls fragment.
String encoded = p.percentEncode(c, HierarchicalUriComponents.Type.FRAGMENT);
String encoded = p.percentEncode(c, UrlParser::fragmentPercentEncodeSet);
Assert.state(url.fragment != null, "Url's fragment should not be null");
url.fragment += encoded;
}
@ -1472,9 +1650,9 @@ final class UrlParser {
private String scheme = "";
private String username = "";
private StringBuilder username = new StringBuilder();
private String password = "";
private StringBuilder password = new StringBuilder();
@Nullable
private Host host = null;
@ -1517,6 +1695,33 @@ final class UrlParser {
}
/**
* The serialization of an origin is the string obtained by applying the following algorithm to the given origin origin:
* If origin is an opaque origin, then return "null".
* Otherwise, let result be origin's scheme.
* Append "://" to result.
* Append origin's host, serialized, to result.
* If origin's port is non-null, append a U+003A COLON character (:), and origin's port, serialized, to result.
* Return result.
*/
public String origin() {
    String scheme = scheme();
    return switch (scheme) {
        // Only these schemes produce a scheme/host/port serialization here.
        case "ftp", "http", "https", "ws", "wss" -> {
            String result = scheme + "://" + host();
            Port port = port();
            yield (port != null) ? result + ':' + port : result;
        }
        // Every other scheme is reported as an opaque origin.
        default -> "null";
    };
}
/**
* A URLs scheme is an ASCII string that identifies the type of URL and can be used to dispatch a URL for
* further processing after parsing. It is initially the empty string.
@ -1525,18 +1730,25 @@ final class UrlParser {
return this.scheme;
}
/**
* The protocol getter steps are to return this's URL's scheme, followed by U+003A (:).
*/
public String protocol() {
    String scheme = scheme();
    // The scheme itself is stored without the trailing colon.
    return scheme + ':';
}
/**
* A URLs username is an ASCII string identifying a username. It is initially the empty string.
*/
public String username() {
    // The username is held in a StringBuilder; return its current content as a String.
    return this.username.toString();
}
/**
* A URLs password is an ASCII string identifying a password. It is initially the empty string.
*/
public String password() {
    // The password is held in a StringBuilder; return its current content as a String.
    return this.password.toString();
}
/**
@ -1547,6 +1759,36 @@ final class UrlParser {
return this.host;
}
/**
* The host getter steps are:
* Let url be this's URL.
* If url's host is null, then return the empty string.
* If url's port is null, return url's host, serialized.
* Return url's host, serialized, followed by U+003A (:) and url's port, serialized.
*/
public String hostString() {
    // Without a host there is nothing to serialize, not even the port.
    if (host() == null) {
        return "";
    }
    StringBuilder result = new StringBuilder(hostname());
    Port port = port();
    if (port != null) {
        result.append(':').append(port);
    }
    return result.toString();
}
/**
 * Return this URL's host as a string, or the empty string if the host is {@code null}.
 */
public String hostname() {
    Host host = host();
    return (host != null) ? host.toString() : "";
}
/**
* A URLs port is either null, a string representing a 16-bit unsigned integer that identifies a networking
* port, or a string containing a uri template . It is initially {@code null}.
@ -1556,6 +1798,15 @@ final class UrlParser {
return this.port;
}
/**
 * Return this URL's port as a string, or the empty string if the port is {@code null}.
 */
public String portString() {
    Port port = port();
    return (port != null) ? port.toString() : "";
}
/**
* A URLs path is a URL {@linkplain Path path}, usually identifying a location. It is initially {@code « »}.
*/
@ -1563,6 +1814,10 @@ final class UrlParser {
return this.path;
}
/**
 * Return the name of this URL's path, as produced by {@code Path.name()}.
 */
public String pathname() {
    Path path = path();
    return path.name();
}
/**
* To shorten a urls path:
* <ol>
@ -1585,6 +1840,21 @@ final class UrlParser {
return this.query;
}
/**
* The search getter steps are:
* If this's URL's query is either null or the empty string, then return the empty string.
* Return U+003F (?), followed by this's URL's query.
*/
public String search() {
    String query = query();
    // An empty query yields "", not a lone "?", in the same way hash() treats an empty fragment.
    if (query == null || query.isEmpty()) {
        return "";
    }
    return "?" + query;
}
/**
* A URLs fragment is either {@code null} or an ASCII string that can be used for further processing on the
* resource the URLs other components identify. It is initially {@code null}.
@ -1594,6 +1864,77 @@ final class UrlParser {
return this.fragment;
}
/**
* The hash getter steps are:
* If this's URL's fragment is either null or the empty string, then return the empty string.
* Return U+0023 (#), followed by this's URL's fragment.
*/
public String hash() {
    String fragment = fragment();
    // Only a non-empty fragment is worth a leading "#".
    if (fragment != null && !fragment.isEmpty()) {
        return "#" + fragment;
    }
    return "";
}
/**
 * Serialize this URL, including its fragment, into a single string.
 */
public String href() {
    // Start with the scheme and U+003A (:).
    StringBuilder output = new StringBuilder(scheme()).append(':');
    Host host = host();
    if (host != null) {
        // A non-null host introduces the "//" authority section.
        output.append("//");
        if (includesCredentials()) {
            // username[:password]@ -- the colon is only written for a non-empty password.
            output.append(username());
            String password = password();
            if (!password.isEmpty()) {
                output.append(':').append(password);
            }
            output.append('@');
        }
        output.append(hostname());
        Port port = port();
        if (port != null) {
            output.append(':').append(port);
        }
    }
    // No host, a non-opaque path with more than one segment, and an empty first segment:
    // write "/." before the path.
    else if (!hasOpaquePath() &&
            path() instanceof PathSegments pathSegments &&
            pathSegments.size() > 1 &&
            pathSegments.get(0).isEmpty()) {
        output.append("/.");
    }
    // The serialized path always follows.
    output.append(pathname());
    // A non-null query is written after U+003F (?), even when it is empty.
    String query = query();
    if (query != null) {
        output.append('?').append(query);
    }
    // A non-null fragment is written after U+0023 (#), even when it is empty.
    String fragment = fragment();
    if (fragment != null) {
        output.append('#').append(fragment);
    }
    return output.toString();
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
@ -1646,7 +1987,7 @@ final class UrlParser {
* The host parser takes a scalar value string input with an optional
* boolean isOpaque (default false), and then runs these steps. They return failure or a host.
*/
static Host parse(String input, boolean isOpaque, @Nullable Consumer<String> validationErrorHandler) {
static Host parse(String input, boolean isOpaque, UrlParser p) {
// If input starts with U+005B ([), then:
if (!input.isEmpty() && input.charAt(0) == '[') {
int last = input.length() - 1;
@ -1660,13 +2001,13 @@ final class UrlParser {
}
// If isOpaque is true, then return the result of opaque-host parsing input.
if (isOpaque) {
return OpaqueHost.parse(input);
return OpaqueHost.parse(input, p);
}
// Assert: input is not the empty string.
Assert.state(!input.isEmpty(), "Input should not be empty");
// Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
String domain = UriUtils.decode(input, StandardCharsets.UTF_8);
String domain = percentDecode(input);
// Let asciiDomain be the result of running domain to ASCII with domain and false.
String asciiDomain = domainToAscii(domain, false);
@ -1679,7 +2020,7 @@ final class UrlParser {
}
// If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain.
if (endsInNumber(asciiDomain)) {
Ipv4Address address = Ipv4Address.parse(asciiDomain, validationErrorHandler);
Ipv4Address address = Ipv4Address.parse(asciiDomain, p);
return new IpAddressHost(address);
}
// Return asciiDomain.
@ -1690,8 +2031,11 @@ final class UrlParser {
private static boolean endsInNumber(String input) {
// Let parts be the result of strictly splitting input on U+002E (.).
List<String> parts = tokenize(input, ".");
List<String> parts = strictSplit(input, '.');
int lastIdx = parts.size() - 1;
if (lastIdx == -1) {
return false;
}
// If the last item in parts is the empty string, then:
if (parts.get(lastIdx).isEmpty()) {
// If partss size is 1, then return false.
@ -1807,11 +2151,62 @@ final class UrlParser {
}
}
record OpaqueHost(String domain) implements Host {
static final class OpaqueHost implements Host {
public static OpaqueHost parse(String input) {
throw new UnsupportedOperationException("Not implemented yet");
private final String host;
private OpaqueHost(String host) {
this.host = host;
}
/**
* The opaque-host parser takes a scalar value string input, and then runs these steps. They return failure or
* an opaque host.
*/
public static OpaqueHost parse(String input, UrlParser p) {
for (int i = 0; i < input.length(); i++) {
char ch = input.charAt(i);
// If input contains a forbidden host code point, host-invalid-code-point validation error, return failure.
if (isForbiddenHost(ch)) {
throw new InvalidUrlException("An opaque host contains a forbidden host code point.");
}
// If input contains a code point that is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
if (p.validate() && !isUrlCodePoint(ch) && ch != '%') {
p.validationError("Code point \"" + ch + "\" is not a URL unit.");
}
//If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, invalid-URL-unit validation error.
if (p.validate() && ch == '%' && (input.length() - i < 2 || !isAsciiDigit(input.charAt(i + 1)) || !isAsciiDigit(input.charAt(i + 2)))) {
p.validationError("Code point \"" + ch + "\" is not a URL unit.");
}
}
//Return the result of running UTF-8 percent-encode on input using the C0 control percent-encode set.
String encoded = p.percentEncode(input, UrlParser::c0ControlPercentEncodeSet);
return new OpaqueHost(encoded);
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
else if (obj instanceof OpaqueHost other) {
return this.host.equals(other.host);
}
else {
return false;
}
}
@Override
public int hashCode() {
return this.host.hashCode();
}
@Override
public String toString() {
return this.host;
}
}
static final class EmptyHost implements Host {
@ -1876,16 +2271,14 @@ final class UrlParser {
return output.toString();
}
public static Ipv4Address parse(String input, @Nullable Consumer<String> validationErrorHandler) {
public static Ipv4Address parse(String input, UrlParser p) {
// Let parts be the result of strictly splitting input on U+002E (.).
List<String> parts = tokenize(input, ".");
List<String> parts = strictSplit(input, '.');
int partsSize = parts.size();
// If the last item in parts is the empty string, then:
if (parts.get(partsSize - 1).isEmpty()) {
// IPv4-empty-part validation error.
if (validationErrorHandler != null) {
validationErrorHandler.accept("IPv4 address ends with \".\"");
}
p.validationError("IPv4 address ends with \".\"");
// If partss size is greater than 1, then remove the last item from parts.
if (partsSize > 1) {
parts.remove(partsSize - 1);
@ -1903,8 +2296,8 @@ final class UrlParser {
String part = parts.get(i);
// Let result be the result of parsing part.
ParseIpv4NumberResult result = parseIpv4Number(part);
if (validationErrorHandler != null && result.validationError()) {
validationErrorHandler.accept("The IPv4 address contains numbers expressed using hexadecimal or octal digits.");
if (p.validate() && result.validationError()) {
p.validationError("The IPv4 address contains numbers expressed using hexadecimal or octal digits.");
}
// Append result to numbers.
numbers.add(result.number());
@ -1912,8 +2305,8 @@ final class UrlParser {
for (Iterator<Integer> iterator = numbers.iterator(); iterator.hasNext(); ) {
Integer number = iterator.next();
// If any item in numbers is greater than 255, IPv4-out-of-range-part validation error.
if (validationErrorHandler != null && number > 255) {
validationErrorHandler.accept("An IPv4 address part exceeds 255.");
if (p.validate() && number > 255) {
p.validationError("An IPv4 address part exceeds 255.");
}
if (iterator.hasNext()) {
// If any but the last item in numbers is greater than 255, then return failure.
@ -2356,6 +2749,8 @@ final class UrlParser {
boolean isOpaque();
Path clone();
String name();
}
static final class PathSegment implements Path {
@ -2384,6 +2779,15 @@ final class UrlParser {
this.segment.append(s);
}
@Override
public String name() {
    String segment = segment();
    // Drop a single leading slash, if there is one.
    return segment.startsWith("/") ? segment.substring(1) : segment;
}
@Override
public boolean isEmpty() {
return this.segment.isEmpty();
@ -2483,6 +2887,16 @@ final class UrlParser {
return new PathSegments(this.segments);
}
@Override
public String name() {
    StringBuilder joined = new StringBuilder();
    // Every segment, including the first, is preceded by a slash.
    for (PathSegment pathSegment : this.segments) {
        joined.append('/').append(pathSegment.name());
    }
    return joined.toString();
}
@Override
public boolean equals(Object o) {
if (o == this) {