Skip to content

Commit

Permalink
Added UrlNormalizer#removeTrailingFragment method.
Browse files Browse the repository at this point in the history
  • Loading branch information
essiembre committed Oct 19, 2024
1 parent adfd9d7 commit e02fcfb
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 44 deletions.
37 changes: 30 additions & 7 deletions src/main/java/com/norconex/commons/lang/url/UrlNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
* <ul>
* <li>{@link #removeDirectoryIndex() Remove directory index}</li>
* <li>{@link #removeFragment() Remove fragment (#)}</li>
* <li>{@link #removeTrailingFragment() Remove trailing fragment (#)}</li>
* <li>{@link #replaceIPWithDomainName() Replace IP with domain name}</li>
* <li>{@link #unsecureScheme() Unsecure scheme (https &rarr; http)}</li>
* <li>{@link #secureScheme() Secure scheme (http &rarr; https)}</li>
Expand Down Expand Up @@ -153,10 +154,11 @@ public class UrlNormalizer implements Serializable {
private static final Pattern PATTERN_PERCENT_ENCODED_CHAR =
Pattern.compile("(%[0-9a-f]{2})", Pattern.CASE_INSENSITIVE);
private static final Pattern PATTERN_PATH_LAST_SEGMENT = Pattern.compile(
"(.*/)(index\\.html|index\\.htm|index\\.shtml|index\\.php"
+ "|default\\.html|default\\.htm|home\\.html|home\\.htm"
+ "|index\\.php5|index\\.php4|index\\.php3|index\\.cgi"
+ "|placeholder\\.html|default\\.asp)$",
"""
(.*/)(index\\.html|index\\.htm|index\\.shtml|index\\.php\
|default\\.html|default\\.htm|home\\.html|home\\.htm\
|index\\.php5|index\\.php4|index\\.php3|index\\.cgi\
|placeholder\\.html|default\\.asp)$""",
Pattern.CASE_INSENSITIVE);
private static final Pattern PATTERN_DOMAIN = Pattern.compile(
"^[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}$",
Expand Down Expand Up @@ -699,16 +701,37 @@ public UrlNormalizer removeDirectoryIndex() {
}

/**
* <p>Removes the URL fragment (from the "#" character until the end).</p>
* <code>http://www.example.com/bar.html#section1 &rarr;
* http://www.example.com/bar.html</code>
* <p>Removes the URL fragment (from the first "#" character encountered
* to the end of the URL).</p>
* <code>http://www.example.com/abc.html#section1 &rarr;
* http://www.example.com/abc.html</code>
* <code>http://www.example.com/abc#/def/ghi &rarr;
* http://www.example.com/abc</code>
* <code>http://www.example.com/abc#def/ghi#klm &rarr;
* http://www.example.com/abc</code>
* @return this instance
*/
public UrlNormalizer removeFragment() {
    // The fragment starts at the first '#' and runs to the very end of the
    // URL. Plain index arithmetic is used instead of a regular expression
    // so that no pattern is compiled on every call, and so that the cut
    // still reaches the end of the string when the fragment contains a
    // line terminator (which the regex "." does not match).
    int hashIndex = url.indexOf('#');
    if (hashIndex >= 0) {
        url = url.substring(0, hashIndex);
    }
    // When there is no '#', the URL is left untouched.
    return this;
}

/**
 * <p>Removes a "trailing" URL fragment: everything from the first "#"
 * character located after the last "/" of the URL, up to the end of the
 * URL. A "#" that is followed by a "/" somewhere later in the URL is
 * left in place, unlike with {@link #removeFragment()}.</p>
 * <code>http://www.example.com/abc.html#section1 &rarr;
 * http://www.example.com/abc.html</code><br>
 * <code>http://www.example.com/abc#/def/ghi &rarr;
 * http://www.example.com/abc#/def/ghi</code><br>
 * <code>http://www.example.com/abc#def/ghi#klm &rarr;
 * http://www.example.com/abc#def/ghi</code>
 * @return this instance
 */
public UrlNormalizer removeTrailingFragment() {
    // Only a '#' found past the last '/' qualifies as trailing; when the
    // URL has no '/', the search starts at index 0.
    int searchFrom = url.lastIndexOf('/') + 1;
    int hashIndex = url.indexOf('#', searchFrom);
    if (hashIndex != -1) {
        url = url.substring(0, hashIndex);
    }
    return this;
}

/**
* <p>Removes the URL query string (from the "?" character until the end
* or the first # character).
Expand Down
89 changes: 52 additions & 37 deletions src/test/java/com/norconex/commons/lang/url/UrlNormalizerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,28 @@
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

public class UrlNormalizerTest {
class UrlNormalizerTest {

private String s;
private String t;

@AfterEach
public void tearDown() throws Exception {
void tearDown() throws Exception {
    // Clear the shared source/target fixture strings after each test.
    t = null;
    s = null;
}

@Test
public void testAllAtOnce() {
s = "https://www.Example.org/0/../1/././%7ea_b:c\\d_|e~f!g "
+ "h/./^i^J[k]//l./m/n/o/../../p/q/r?cc=&dd=ee&bb=aa"
+ "#fragment";
void testAllAtOnce() {
s = """
https://www.Example.org/0/../1/././%7ea_b:c\\d_|e~f!g \
h/./^i^J[k]//l./m/n/o/../../p/q/r?cc=&dd=ee&bb=aa\
#fragment""";
t = "http://example.org/1/~a_b:c%5Cd_%7Ce~f!g%20h/%5Ei%5EJ%5Bk%5D/l./"
+ "m/p/q/r/?bb=aa&dd=ee";
//System.out.println("original : " + s);

UrlNormalizer n = new UrlNormalizer(s)
var n = new UrlNormalizer(s)
.addDirectoryTrailingSlash()
.addWWW()
.removeFragment()
Expand All @@ -71,7 +72,7 @@ public void testAllAtOnce() {
}

@Test
public void testAddDomainTrailingSlash() {
void testAddDomainTrailingSlash() {
s = "http://www.example.com";
t = "http://www.example.com/";
assertEquals(t, n(s).addDomainTrailingSlash().toString());
Expand Down Expand Up @@ -106,14 +107,14 @@ public void testAddDomainTrailingSlash() {
}

@Test
public void testEncodeUTF8Characters() {
// Checks that accented (non-ASCII) characters in both the path and the
// query string come out percent-encoded as their UTF-8 byte sequences
// (e.g. "é" is expected as %C3%A9 and "è" as %C3%A8).
void testEncodeUTF8Characters() {
s = "http://www.example.com/élève?série=0é0è";
t = "http://www.example.com/%C3%A9l%C3%A8ve?s%C3%A9rie=0%C3%A90%C3%A8";
assertEquals(t, n(s).encodeNonURICharacters().toString());
}

@Test
public void testEncodeNonURICharacters() {
void testEncodeNonURICharacters() {
s = "http://www.example.com/^a [b]/c?d e=";
t = "http://www.example.com/%5Ea%20%5Bb%5D/c?d+e=";
assertEquals(t, n(s).encodeNonURICharacters().toString());
Expand All @@ -126,14 +127,14 @@ public void testEncodeNonURICharacters() {
}

@Test
public void testEncodeSpaces() {
// Checks space encoding: spaces in the path are expected as "%20" while
// spaces in the query string are expected as "+".
void testEncodeSpaces() {
s = "http://www.example.com/a b c?d e=f g";
t = "http://www.example.com/a%20b%20c?d+e=f+g";
assertEquals(t, n(s).encodeSpaces().toString());
}

@Test
public void testLowerCase() {
void testLowerCase() {

// All
s = "HTTP://www.Example.com/Path/Query?Param1=AAA&amp;Param2=BBB";
Expand Down Expand Up @@ -188,26 +189,27 @@ public void testLowerCase() {
}

@Test
public void testUpperCaseEscapeSequence() {
// Checks that the hex digits of percent-escape sequences are upper-cased
// (%c2%b1 is expected as %C2%B1) while the rest of the URL is unchanged.
void testUpperCaseEscapeSequence() {
s = "http://www.example.com/a%c2%b1b";
t = "http://www.example.com/a%C2%B1b";
assertEquals(t, n(s).upperCaseEscapeSequence().toString());
}

@Test
public void testDecodeUnreservedCharacters() {
void testDecodeUnreservedCharacters() {
// ALPHA (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D),
// period (%2E), underscore (%5F), or tilde (%7E)
s = "http://www.example.com/%41%42%59%5Aalpha"
+ "%61%62%79%7A/digit%30%31%38%39/%2Dhyphen/period%2E"
+ "/underscore%5F/%7Etilde/reserved%2F%3A%5B%26";
s = """
http://www.example.com/%41%42%59%5Aalpha\
%61%62%79%7A/digit%30%31%38%39/%2Dhyphen/period%2E\
/underscore%5F/%7Etilde/reserved%2F%3A%5B%26""";
t = "http://www.example.com/ABYZalphaabyz/digit0189"
+ "/-hyphen/period./underscore_/~tilde/reserved%2F%3A%5B%26";
assertEquals(t, n(s).decodeUnreservedCharacters().toString());
}

@Test
public void testRemoveDefaultPort() {
void testRemoveDefaultPort() {
s = "http://www.example.com:80/bar.html";
t = "http://www.example.com/bar.html";
assertEquals(t, n(s).removeDefaultPort().toString());
Expand All @@ -232,7 +234,7 @@ public void testRemoveDefaultPort() {
}

@Test
public void testAddTrailingSlash() {
void testAddTrailingSlash() {
s = "http://www.example.com/alice";
t = "http://www.example.com/alice/";
assertEquals(t, n(s).addDirectoryTrailingSlash().toString());
Expand All @@ -255,7 +257,7 @@ public void testAddTrailingSlash() {
}

@Test
public void testRemoveTrailingSlash() {
void testRemoveTrailingSlash() {
s = "http://www.example.com/alice/";
t = "http://www.example.com/alice";
assertEquals(t, n(s).removeTrailingSlash().toString());
Expand Down Expand Up @@ -283,7 +285,7 @@ public void testRemoveTrailingSlash() {
}

@Test
public void testRemoveTrailingHash() {
void testRemoveTrailingHash() {
s = "http://www.example.com/blah#";
t = "http://www.example.com/blah";
assertEquals(t, n(s).removeTrailingHash().toString());
Expand All @@ -296,7 +298,7 @@ public void testRemoveTrailingHash() {
}

@Test
public void testRemoveDotSegments() {
void testRemoveDotSegments() {
s = "http://www.example.com/../a/b/../c/./d.html";
t = "http://www.example.com/a/c/d.html";
assertEquals(t, n(s).removeDotSegments().toString());
Expand All @@ -309,7 +311,7 @@ public void testRemoveDotSegments() {
assertEquals(t, n(s).removeDotSegments().toString());

//--- Tests from http://tools.ietf.org/html/rfc3986#section-5.4 ---
String urlRoot = "http://a.com";
var urlRoot = "http://a.com";
Map<String, String> m = new HashMap<>();

// 5.4.1 Normal Examples
Expand Down Expand Up @@ -353,7 +355,7 @@ public void testRemoveDotSegments() {
}

@Test
public void testRemoveDirectoryIndex() {
void testRemoveDirectoryIndex() {
s = "http://www.example.com/index.html";
t = "http://www.example.com/";
assertEquals(t, n(s).removeDirectoryIndex().toString());
Expand All @@ -375,7 +377,7 @@ public void testRemoveDirectoryIndex() {
}

@Test
public void testRemoveFragment() {
void testRemoveFragment() {
s = "http://www.example.com/bar.html#section1";
t = "http://www.example.com/bar.html";
assertEquals(t, n(s).removeFragment().toString());
Expand All @@ -385,7 +387,20 @@ public void testRemoveFragment() {
}

@Test
public void testRemoveQueryString() {
// Exercises removeTrailingFragment() on the three cases from its Javadoc.
void testRemoveTrailingFragment() {
// A fragment after the last path segment is removed.
s = "http://www.example.com/abc.html#section1";
t = "http://www.example.com/abc.html";
assertEquals(t, n(s).removeTrailingFragment().toString());
// A "#" followed by "/" is not trailing: the URL is expected unchanged.
s = "http://www.example.com/abc#/def/ghi";
t = "http://www.example.com/abc#/def/ghi";
assertEquals(t, n(s).removeTrailingFragment().toString());
// With two "#", only the last one (with no "/" after it) is removed.
s = "http://www.example.com/abc#def/ghi#klm";
t = "http://www.example.com/abc#def/ghi";
assertEquals(t, n(s).removeTrailingFragment().toString());
}

@Test
void testRemoveQueryString() {
s = "http://www.example.com/q?param1=AAA&param2=BBB";
t = "http://www.example.com/q";
assertEquals(t, n(s).removeQueryString().toString());
Expand All @@ -402,7 +417,7 @@ public void testRemoveQueryString() {

@Test
@Disabled("This test may not have proper network condition to execute.")
public void testReplaceIPWithDomainName() {
void testReplaceIPWithDomainName() {
s = "http://208.80.154.224/wiki/Main_Page";
t = null;
Assertions.assertTrue(
Expand All @@ -417,7 +432,7 @@ public void testReplaceIPWithDomainName() {
}

@Test
public void testUnsecureScheme() {
void testUnsecureScheme() {
s = "https://www.example.com/secure.html";
t = "http://www.example.com/secure.html";
assertEquals(t, n(s).unsecureScheme().toString());
Expand All @@ -430,7 +445,7 @@ public void testUnsecureScheme() {
}

@Test
public void testSecureScheme() {
void testSecureScheme() {
s = "https://www.example.com/secure.html";
t = "https://www.example.com/secure.html";
assertEquals(t, n(s).secureScheme().toString());
Expand All @@ -443,7 +458,7 @@ public void testSecureScheme() {
}

@Test
public void testRemoveDuplicateSlashes() {
void testRemoveDuplicateSlashes() {
s = "http://www.example.com/a//b///c////d/////e.html";
t = "http://www.example.com/a/b/c/d/e.html";
assertEquals(t, n(s).removeDuplicateSlashes().toString());
Expand All @@ -457,7 +472,7 @@ public void testRemoveDuplicateSlashes() {
}

@Test
public void testRemoveWWW() {
void testRemoveWWW() {
s = "http://www.example.com/foo.html";
t = "http://example.com/foo.html";
assertEquals(t, n(s).removeWWW().toString());
Expand All @@ -467,7 +482,7 @@ public void testRemoveWWW() {
}

@Test
public void testAddWWW() {
void testAddWWW() {
s = "http://example.com/foo.html";
t = "http://www.example.com/foo.html";
assertEquals(t, n(s).addWWW().toString());
Expand All @@ -480,7 +495,7 @@ public void testAddWWW() {
}

@Test
public void testSortQueryParameters() {
void testSortQueryParameters() {
// test with fragment
s = "http://example.com?z=1&a=1#frag";
t = "http://example.com?a=1&z=1#frag";
Expand All @@ -505,14 +520,14 @@ public void testSortQueryParameters() {
}

@Test
public void testRemoveEmptyParameters() {
// Checks that query parameters with an empty value ("a=", "e="), with no
// value at all ("h"), or with an empty name ("=i") are dropped, while
// parameters having both a name and a value are kept in their original order.
void testRemoveEmptyParameters() {
s = "http://www.example.com/display?a=b&a=&c=d&e=&f=g&h&=i";
t = "http://www.example.com/display?a=b&c=d&f=g";
assertEquals(t, n(s).removeEmptyParameters().toString());
}

@Test
public void testRemoveTrailingQuestionMark() {
void testRemoveTrailingQuestionMark() {
s = "http://www.example.com/remove?";
t = "http://www.example.com/remove";
assertEquals(t, n(s).removeTrailingQuestionMark().toString());
Expand All @@ -525,7 +540,7 @@ public void testRemoveTrailingQuestionMark() {
}

@Test
public void testRemoveSessionIds() {
void testRemoveSessionIds() {
//PHP
s = "http://1.eg.com/app?a=b&PHPSESSID=f9f2770d591366bc&aa=bbb&c=d";
t = "http://1.eg.com/app?a=b&aa=bbb&c=d";
Expand Down Expand Up @@ -580,7 +595,7 @@ public void testRemoveSessionIds() {
// Test for supporting file:// scheme, from here:
// https://github.com/Norconex/commons-lang/issues/11
@Test
public void testFileScheme() {
void testFileScheme() {

// Encode non-URI characters
s = "file:///etc/some dir/my file.txt";
Expand Down

0 comments on commit e02fcfb

Please sign in to comment.