Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

LinkExtractor.java 3.0KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. /*
  2. * Copyright (c) 2006-2017 DMDirc Developers
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
  5. * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
  6. * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
  7. * permit persons to whom the Software is furnished to do so, subject to the following conditions:
  8. *
  9. * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
  10. * Software.
  11. *
  12. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
  13. * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
  14. * OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  15. * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  16. */
  17. package com.dmdirc.util.text;
  18. import java.util.ArrayList;
  19. import java.util.List;
  20. import java.util.regex.Matcher;
  21. import java.util.regex.Pattern;
  22. /**
  23. * Finds links within a body of text.
  24. */
  25. public class LinkExtractor {
  26. /** Defines all characters treated as trailing punctuation that are illegal in URLs. */
  27. private static final String URL_PUNCT_ILLEGAL = "\"";
  28. /** Defines all characters treated as trailing punctuation that're legal in URLs. */
  29. private static final String URL_PUNCT_LEGAL = "';:!,\\.\\?";
  30. /** Defines all trailing punctuation. */
  31. private static final String URL_PUNCT = URL_PUNCT_ILLEGAL + URL_PUNCT_LEGAL;
  32. /** Defines all characters allowed in URLs that aren't treated as trailing punct. */
  33. private static final String URL_NOPUNCT = "a-z0-9$\\-_@&\\+\\*\\(\\)=/#%~\\|";
  34. /** Defines all characters allowed in URLs per W3C specs. */
  35. private static final String URL_CHARS = '[' + URL_PUNCT_LEGAL + URL_NOPUNCT
  36. + "]*[" + URL_NOPUNCT + "]+[" + URL_PUNCT_LEGAL + URL_NOPUNCT + "]*";
  37. /** The regular expression to use for marking up URLs. */
  38. private static final String URL_REGEXP = "(?i)((?>(?<!" + "[a-f0-9]{5})[a-f]|[g-z+])+://" +
  39. URL_CHARS + "|(?<![a-z0-9:/])www\\." + URL_CHARS + ')';
  40. /** The pattern to use to match URLs. */
  41. private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP);
  42. /**
  43. * Finds all available links within the given text.
  44. *
  45. * @param text The text to search for links.
  46. * @return A list of found links, ordered according to their position in the text.
  47. */
  48. public List<Link> findLinks(final CharSequence text) {
  49. final List<Link> res = new ArrayList<>();
  50. final Matcher matcher = URL_PATTERN.matcher(text);
  51. while (matcher.find()) {
  52. res.add(new Link(matcher.start(), matcher.end(), matcher.group()));
  53. }
  54. return res;
  55. }
  56. }