/*
 * Copyright (c) 2006-2015 DMDirc Developers
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.dmdirc.util.text;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds links within a body of text.
 *
 * <p>The constants below are fragments of regular expression character classes. They are
 * concatenated into {@code URL_CHARS}, which describes the characters that may make up the
 * body of a URL.
 */
public class LinkExtractor {

    /**
     * Trailing punctuation that is illegal in URLs: the double quote character.
     */
    private static final String URL_PUNCT_ILLEGAL = "\"";

    /**
     * Trailing punctuation that is legal in URLs: apostrophe, semicolon, colon, exclamation
     * mark, comma, full stop and question mark. The full stop and question mark are
     * backslash-escaped because this string is spliced into a regular expression.
     */
    private static final String URL_PUNCT_LEGAL = "';:!,\\.\\?";

    /**
     * All trailing punctuation: the illegal set followed by the legal set.
     */
    private static final String URL_PUNCT = URL_PUNCT_ILLEGAL + URL_PUNCT_LEGAL;

    /**
     * Characters allowed in URLs that are not treated as trailing punctuation: the ranges
     * a-z and 0-9 plus a fixed set of symbols, escaped where needed for use inside a
     * regular expression character class.
     */
    private static final String URL_NOPUNCT = "a-z0-9$\\-_@&\\+\\*\\(\\)=/#%~\\|";

    /**
     * Defines all characters allowed in URLs per W3C specs.
     *
     * <p>The expression is three character classes in sequence: zero or more characters that
     * are either legal punctuation or non-punctuation, then one or more non-punctuation
     * characters, then again zero or more characters of either kind. A match therefore
     * always contains at least one non-punctuation character.
     */
    private static final String URL_CHARS = '[' + URL_PUNCT_LEGAL + URL_NOPUNCT + "]*[" + URL_NOPUNCT + "]+[" + URL_PUNCT_LEGAL + URL_NOPUNCT + "]*";

    /**
     * The regular expression to use for marking up URLs. It starts with the embedded
     * case-insensitivity flag followed by a capturing group and an atomic group.
     *
     * <p>NOTE(review): the text of this file appears to be damaged from this point on. The
     * string literal below is never terminated and runs directly into the remainder of a
     * method named findLinks. The rest of the regular expression, the URL_PATTERN
     * declaration and the opening of the findLinks declaration are not present, and some
     * generic type arguments also look to be missing (presumably text between angle
     * brackets was stripped - TODO confirm). Restore this section from the upstream
     * original rather than reconstructing it by hand.
     *
     * <p>The surviving fragment of findLinks creates a matcher from URL_PATTERN over the
     * given text and, for every match found, adds a Link built from the match start offset,
     * the match end offset and the matched text to a list, which it then returns. The
     * damaged text is kept verbatim on one line so that nothing further is lost.
     */
    private static final String URL_REGEXP = "(?i)((?>(? findLinks(final CharSequence text) { final List res = new ArrayList<>(); final Matcher matcher = URL_PATTERN.matcher(text); while (matcher.find()) { res.add(new Link(matcher.start(), matcher.end(), matcher.group())); } return res; } }