You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

LinkExtractor.java 3.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. /*
  2. * Copyright (c) 2006-2015 DMDirc Developers
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a copy
  5. * of this software and associated documentation files (the "Software"), to deal
  6. * in the Software without restriction, including without limitation the rights
  7. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. * copies of the Software, and to permit persons to whom the Software is
  9. * furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. * SOFTWARE.
  21. */
  22. package com.dmdirc.util.text;
  23. import java.util.ArrayList;
  24. import java.util.List;
  25. import java.util.regex.Matcher;
  26. import java.util.regex.Pattern;
  27. /**
  28. * Finds links within a body of text.
  29. */
  30. public class LinkExtractor {
  31. /** Defines all characters treated as trailing punctuation that are illegal in URLs. */
  32. private static final String URL_PUNCT_ILLEGAL = "\"";
  33. /** Defines all characters treated as trailing punctuation that're legal in URLs. */
  34. private static final String URL_PUNCT_LEGAL = "';:!,\\.\\?";
  35. /** Defines all trailing punctuation. */
  36. private static final String URL_PUNCT = URL_PUNCT_ILLEGAL + URL_PUNCT_LEGAL;
  37. /** Defines all characters allowed in URLs that aren't treated as trailing punct. */
  38. private static final String URL_NOPUNCT = "a-z0-9$\\-_@&\\+\\*\\(\\)=/#%~\\|";
  39. /** Defines all characters allowed in URLs per W3C specs. */
  40. private static final String URL_CHARS = '[' + URL_PUNCT_LEGAL + URL_NOPUNCT
  41. + "]*[" + URL_NOPUNCT + "]+[" + URL_PUNCT_LEGAL + URL_NOPUNCT + "]*";
  42. /** The regular expression to use for marking up URLs. */
  43. private static final String URL_REGEXP = "(?i)((?>(?<!" + "[a-f0-9]{5})[a-f]|[g-z+])+://" +
  44. URL_CHARS + "|(?<![a-z0-9:/])www\\." + URL_CHARS + ')';
  45. /** The pattern to use to match URLs. */
  46. private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP);
  47. /**
  48. * Finds all available links within the given text.
  49. *
  50. * @param text The text to search for links.
  51. * @return A list of found links, ordered according to their position in the text.
  52. */
  53. public List<Link> findLinks(final CharSequence text) {
  54. final List<Link> res = new ArrayList<>();
  55. final Matcher matcher = URL_PATTERN.matcher(text);
  56. while (matcher.find()) {
  57. res.add(new Link(matcher.start(), matcher.end(), matcher.group()));
  58. }
  59. return res;
  60. }
  61. }