Ver código fonte

Start work on a generic link extractor.

At the moment this just implements the first step of the Styliser's
link detection. It doesn't apply any of the "intelligent" corrections.
pull/1/head
Chris Smith 9 anos atrás
pai
commit
fba5088719

+ 93
- 0
src/com/dmdirc/util/text/Link.java Ver arquivo

@@ -0,0 +1,93 @@
1
+/*
2
+ * Copyright (c) 2006-2014 DMDirc Developers
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ * of this software and associated documentation files (the "Software"), to deal
6
+ * in the Software without restriction, including without limitation the rights
7
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ * copies of the Software, and to permit persons to whom the Software is
9
+ * furnished to do so, subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in
12
+ * all copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ * SOFTWARE.
21
+ */
22
+
23
+package com.dmdirc.util.text;
24
+
25
+import java.util.Objects;
26
+
27
+/**
28
+* Describes a single link found within some text.
29
+*/
30
+public class Link {
31
+
32
+    private final int start;
33
+    private final int end;
34
+    private final String content;
35
+
36
+    public Link(final int start, final int end, final String content) {
37
+        this.start = start;
38
+        this.end = end;
39
+        this.content = content;
40
+    }
41
+
42
+    /**
43
+     * Gets the character offset that the link was found at.
44
+     *
45
+     * @return The position the link was found at
46
+     */
47
+    public int getStart() {
48
+        return start;
49
+    }
50
+
51
+    /**
52
+     * Gets the character offset of the end of the link.
53
+     *
54
+     * @return The position immediately after the last character of the link.
55
+     */
56
+    public int getEnd() {
57
+        return end;
58
+    }
59
+
60
+    /**
61
+     * Gets the content of the link.
62
+     *
63
+     * @return The link's content
64
+     */
65
+    public String getContent() {
66
+        return content;
67
+    }
68
+
69
+    @Override
70
+    public boolean equals(final Object o) {
71
+        if (this == o) {
72
+            return true;
73
+        }
74
+
75
+        if (o == null || getClass() != o.getClass()) {
76
+            return false;
77
+        }
78
+
79
+        final Link link = (Link) o;
80
+        return end == link.getEnd() && start == link.getStart()
81
+                && Objects.equals(content, link.getContent());
82
+    }
83
+
84
+    @Override
85
+    public int hashCode() {
86
+        return Objects.hash(start, end, content);
87
+    }
88
+
89
+    @Override
90
+    public String toString() {
91
+        return "Link{start=" + start + ", end=" + end + ", content='" + content + "'}";
92
+    }
93
+}

+ 69
- 0
src/com/dmdirc/util/text/LinkExtractor.java Ver arquivo

@@ -0,0 +1,69 @@
1
+/*
2
+ * Copyright (c) 2006-2014 DMDirc Developers
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ * of this software and associated documentation files (the "Software"), to deal
6
+ * in the Software without restriction, including without limitation the rights
7
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ * copies of the Software, and to permit persons to whom the Software is
9
+ * furnished to do so, subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in
12
+ * all copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ * SOFTWARE.
21
+ */
22
+
23
+package com.dmdirc.util.text;
24
+
25
+import java.util.ArrayList;
26
+import java.util.List;
27
+import java.util.regex.Matcher;
28
+import java.util.regex.Pattern;
29
+
30
+/**
31
+ * Finds links within a body of text.
32
+ */
33
+public class LinkExtractor {
34
+
35
+    /** Defines all characters treated as trailing punctuation that are illegal in URLs. */
36
+    private static final String URL_PUNCT_ILLEGAL = "\"";
37
+    /** Defines all characters treated as trailing punctuation that're legal in URLs. */
38
+    private static final String URL_PUNCT_LEGAL = "';:!,\\.\\?";
39
+    /** Defines all trailing punctuation. */
40
+    private static final String URL_PUNCT = URL_PUNCT_ILLEGAL + URL_PUNCT_LEGAL;
41
+    /** Defines all characters allowed in URLs that aren't treated as trailing punct. */
42
+    private static final String URL_NOPUNCT = "a-z0-9$\\-_@&\\+\\*\\(\\)=/#%~\\|";
43
+    /** Defines all characters allowed in URLs per W3C specs. */
44
+    private static final String URL_CHARS = '[' + URL_PUNCT_LEGAL + URL_NOPUNCT
45
+            + "]*[" + URL_NOPUNCT + "]+[" + URL_PUNCT_LEGAL + URL_NOPUNCT + "]*";
46
+    /** The regular expression to use for marking up URLs. */
47
+    private static final String URL_REGEXP = "(?i)((?>(?<!" + "[a-f0-9]{5})[a-f]|[g-z+])+://" +
48
+            URL_CHARS + "|(?<![a-z0-9:/])www\\." + URL_CHARS + ')';
49
+    /** The pattern to use to match URLs. */
50
+    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP);
51
+
52
+    /**
53
+     * Finds all available links within the given text.
54
+     *
55
+     * @param text The text to search for links.
56
+     * @return A list of found links, ordered according to their position in the text.
57
+     */
58
+    public List<Link> findLinks(final CharSequence text) {
59
+        final List<Link> res = new ArrayList<>();
60
+        final Matcher matcher = URL_PATTERN.matcher(text);
61
+
62
+        while (matcher.find()) {
63
+            res.add(new Link(matcher.start(), matcher.end(), matcher.group()));
64
+        }
65
+
66
+        return res;
67
+    }
68
+
69
+}

+ 143
- 0
test/com/dmdirc/util/text/LinkExtractorTest.java Ver arquivo

@@ -0,0 +1,143 @@
1
+/*
2
+ * Copyright (c) 2006-2014 DMDirc Developers
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ * of this software and associated documentation files (the "Software"), to deal
6
+ * in the Software without restriction, including without limitation the rights
7
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ * copies of the Software, and to permit persons to whom the Software is
9
+ * furnished to do so, subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in
12
+ * all copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ * SOFTWARE.
21
+ */
22
+
23
+package com.dmdirc.util.text;
24
+
25
+import java.util.Arrays;
26
+import java.util.List;
27
+
28
+import org.junit.Before;
29
+import org.junit.Ignore;
30
+import org.junit.Test;
31
+
32
+import static org.junit.Assert.assertEquals;
33
+
34
+public class LinkExtractorTest {
35
+
36
+    private LinkExtractor extractor;
37
+
38
+    @Before
39
+    public void setup() {
40
+        extractor = new LinkExtractor();
41
+    }
42
+
43
+    private void testLinks(final CharSequence input, final Link ... expected) {
44
+        final List<Link> result = extractor.findLinks(input);
45
+        assertEquals(Arrays.asList(expected), result);
46
+    }
47
+
48
+    @Test
49
+    public void testEmptyString() {
50
+        testLinks("");
51
+    }
52
+
53
+    @Test
54
+    public void testNoLinks() {
55
+        testLinks("no links here!");
56
+    }
57
+
58
+    @Test
59
+    public void testPlainWwwLink() {
60
+        testLinks("www.google.com", new Link(0, 14, "www.google.com"));
61
+    }
62
+
63
+    @Test
64
+    @Ignore("Not implemented yet")
65
+    public void testPlainWwwLinkInSentence() {
66
+        testLinks("word www.google.com word", new Link(5, 19, "www.google.com"));
67
+        testLinks("word www.google.com, word", new Link(5, 19, "www.google.com"));
68
+    }
69
+
70
+    @Test
71
+    @Ignore("Not implemented yet")
72
+    public void testPlainWwwLinkWithPunctuation() {
73
+        testLinks("www.google.com.", new Link(0, 14, "www.google.com"));
74
+        testLinks("www.google.com!", new Link(0, 14, "www.google.com"));
75
+        testLinks("www.google.com?", new Link(0, 14, "www.google.com"));
76
+    }
77
+
78
+    @Test
79
+    @Ignore("Not implemented yet")
80
+    public void testPlainWwwLinkWithBrackets() {
81
+        testLinks("(www.google.com)", new Link(1, 15, "www.google.com"));
82
+        testLinks("[www.google.com]", new Link(1, 15, "www.google.com"));
83
+        testLinks("<www.google.com>", new Link(1, 15, "www.google.com"));
84
+    }
85
+
86
+    @Test
87
+    @Ignore("Not implemented yet")
88
+    public void testPlainWwwLinkWithComplicatedBrackets() {
89
+        testLinks("(foo: www.google.com)", new Link(6, 21, "www.google.com"));
90
+        testLinks("[foo: www.google.com]", new Link(6, 21, "www.google.com"));
91
+        testLinks("<foo: www.google.com>", new Link(6, 21, "www.google.com"));
92
+
93
+        testLinks("(foo: 'www.google.com')", new Link(7, 22, "www.google.com"));
94
+        testLinks("('www.google.com')", new Link(2, 17, "www.google.com"));
95
+        testLinks("[\"www.google.com\"]", new Link(2, 17, "www.google.com"));
96
+
97
+        testLinks("www.example.com/blah(foobar)", new Link(0, 28, "www.example.com/blah(foobar)"));
98
+    }
99
+
100
+    @Test
101
+    @Ignore("Not implemented yet")
102
+    public void testPlainWwwLinkWithQuotes() {
103
+        testLinks("\"www.google.com\"", new Link(1, 15, "www.google.com"));
104
+        testLinks("'www.google.com'", new Link(1, 15, "www.google.com"));
105
+    }
106
+
107
+    @Test
108
+    public void testPrefixedLinks() {
109
+        testLinks("http://www.google.com", new Link(0, 21, "http://www.google.com"));
110
+        testLinks("http://www.google.com:80/test#flub",
111
+                new Link(0, 34, "http://www.google.com:80/test#flub"));
112
+
113
+        testLinks("svn+ssh://foo:bar", new Link(0, 17, "svn+ssh://foo:bar"));
114
+    }
115
+
116
+    @Test
117
+    @Ignore("Not implemented yet")
118
+    public void testMultipleLinks() {
119
+        testLinks("www.foo.com www.bar.com",
120
+                new Link(0, 11, "www.foo.com"),
121
+                new Link(12, 23, "www.bar.com"));
122
+
123
+        testLinks("(www.foo.com www.bar.com)",
124
+                new Link(1, 12, "www.foo.com"),
125
+                new Link(13, 24, "www.bar.com"));
126
+
127
+        testLinks("('www.foo.com')->x(\"www.bar.com\")",
128
+                new Link(2, 13, "www.foo.com"),
129
+                new Link(20, 31, "www.bar.com"));
130
+
131
+        testLinks("\"foo\" www.foo.com \"www.bar.com\"",
132
+                new Link(6, 17, "www.foo.com"),
133
+                new Link(19, 30, "www.bar.com"));
134
+    }
135
+
136
+    @Test
137
+    public void testIncompleteLinks() {
138
+        testLinks("www...");
139
+        testLinks("http://...");
140
+        testLinks("/var/web/www.google.com/blah");
141
+    }
142
+
143
+}

Carregando…
Cancelar
Salvar