需要找出微博正文中的链接(主要为http链接),话题标签(#内容#),@用户,用正则表达式解决之,暂时找到的方案如下
1. 链接
正则表达式
(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)
Java程序示例
/** * URL正则表达式 */ private static final Pattern urlPattern = Pattern.compile( "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)" + "(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*" + "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); /** * 去掉文本中URLs * @param text * @return */ public static String removeURLs(String text){ Matcher matcher; String newTweet = text.trim(); String cleanedText=""; while(!newTweet.equals(cleanedText)){ cleanedText=newTweet; matcher = urlPattern.matcher(cleanedText); newTweet = matcher.replaceAll(""); newTweet =newTweet.trim(); } return cleanedText; } /** * 获得文本中URL列表 * @param originalString * @return */ public static List<String> getURLs(String originalString){ List<String> urlsSet=new ArrayList<String>(); Matcher matcher = urlPattern.matcher(originalString); while (matcher.find()) { int matchStart = matcher.start(1); int matchEnd = matcher.end(); String tmpUrl=originalString.substring(matchStart,matchEnd); urlsSet.add(tmpUrl); // now you have the offsets of a URL match originalString=originalString.replace(tmpUrl,""); matcher = urlPattern.matcher(originalString); } return urlsSet; }
2. 话题标签
正则表达式
#[^#]+#
Java程序示例
/** * Hashtag正则表达式 */ // private static final Pattern hashtagPattern = // Pattern.compile("(?:^|\\s|[\\p{Punct}&&[^/]])(#[\\p{L}0-9-_]+)"); private static final Pattern hashtagPattern = Pattern.compile("#[^#]+#"); private static String removeHashtags(String text){ Matcher matcher; String newTweet = text.trim(); String cleanedText=""; while(!newTweet.equals(cleanedText)){ cleanedText=newTweet; matcher = hashtagPattern.matcher(cleanedText); newTweet = matcher.replaceAll(""); newTweet =newTweet.trim(); } return cleanedText; } public static List<String> getHashtags(String originalString){ List<String> hashtagSet=new ArrayList<String>(); Matcher matcher = hashtagPattern.matcher(originalString); while (matcher.find()) { // int matchStart = matcher.start(1); int matchStart = matcher.start(); int matchEnd = matcher.end(); String tmpHashtag=originalString.substring(matchStart,matchEnd); hashtagSet.add(tmpHashtag); originalString=originalString.replace(tmpHashtag,""); matcher = hashtagPattern.matcher(originalString); } return hashtagSet; }
3. @用户
正则表达式
@[\u4e00-\u9fa5a-zA-Z0-9_-]{2,30}
Java程序示例
/** * 用户@正则表达式 * 新浪微博中的用户名格式为是“4-30个字符,支持英文、数字、"_"或减号”,* 也就是说,支持中文、字母、数字、下划线及减号,并且是4到30个字符(这里暂且认为汉字为一个字符) * 那么在写匹配的表达式的时候就可以这么来写: @[\u4e00-\u9fa5a-zA-Z0-9_-]{4,30} */ // private static final Pattern usermentionPattern = // Pattern.compile("(?:^|\\s|[\\p{Punct}&&[^/]])(@[\\p{L}0-9-_]+)"); private static final Pattern usermentionPattern = Pattern.compile("@[\u4e00-\u9fa5a-zA-Z0-9_-]{2,30}"); public static String removeUserMentions(String text){ Matcher matcher; String newTweet = text.trim(); String cleanedText=""; while(!newTweet.equals(cleanedText)){ cleanedText=newTweet; matcher = usermentionPattern.matcher(cleanedText); newTweet = matcher.replaceAll(""); newTweet =newTweet.trim(); } return cleanedText; } public static List<String> getUsermentions(String originalString){ List<String> usermentionsSet=new ArrayList<String>(); Matcher matcher = usermentionPattern.matcher(originalString); while (matcher.find()) { // int matchStart = matcher.start(1); int matchStart = matcher.start(); int matchEnd = matcher.end(); String tmpUsermention=originalString.substring(matchStart,matchEnd); usermentionsSet.add(tmpUsermention); originalString=originalString.replace(tmpUsermention,""); matcher = usermentionPattern.matcher(originalString); } return usermentionsSet; }