我想为一批已下载的HTML文件创建一个语料库,然后在R中读取它们,以便日后进行文本挖掘.
从本质上讲,这就是我想要做的:
>从多个html文件创建语料库.
我尝试使用DirSource:
library(tm)

# Directory source pointing at the folder that holds the HTML files.
a <- DirSource("C:/test")

# Reader settings handed to Corpus(): English documents, read as plain text.
reader_settings <- list(language = "eng", reader = readPlain)

# NOTE(review): `a` is already a DirSource object and is passed to
# DirSource() a second time here. This is the call that is reported to fail
# with an invalid-directory-argument error -- presumably because DirSource()
# expects a directory path rather than a source object; confirm.
b <- Corpus(DirSource(a), readerControl = reader_settings)
但它返回“无效的目录参数”
>立即从Corpus读取html文件.
不知道怎么做.
>解析它们,将它们转换为纯文本,删除标签.
很多人建议使用XML包,但是我找不到用它处理多个文件的方法——那些示例都只针对单个文件.
非常感谢.
解决方法
下面的代码应该可以解决这个问题.在这里,我的计算机上有一个存放HTML文件的文件夹(来自SO的随机样本),我用这些文件创建了一个语料库,然后生成了一个词项-文档矩阵(term-document matrix),最后完成了一些简单的文本挖掘任务.
# Build a corpus from a folder of downloaded HTML files, derive a
# term-document matrix from it, and run a simple hierarchical clustering.

# get data
setwd("C:/Downloads/html") # this folder has your HTML files
html <- list.files(pattern = "\\.(htm|html)$") # get just .htm and .html files

# load packages
library(tm)
library(RCurl)
library(XML)

# get some code from github to convert HTML to text
# NOTE(review): this downloads a script with SSL peer verification switched
# off and then source()s it, i.e. it runs unauthenticated remote code.
# Review the downloaded file (or keep a vetted local copy) before relying on it.
writeChar(
  con = "htmlToText.R",
  getURL(
    ssl.verifypeer = FALSE,
    "https://raw.github.com/tonybreyal/Blog-Reference-Functions/master/R/htmlToText/htmlToText.R"
  )
)
source("htmlToText.R")

# convert HTML to text
html2txt <- lapply(html, htmlToText)

# clean out non-ASCII characters
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub = ""))

# make corpus for text mining
# VCorpus keeps every document as a PlainTextDocument, which the
# transformations below (including content_transformer()) operate on.
corpus <- VCorpus(VectorSource(html2txtclean))

# process text...
skipWords <- function(x) removeWords(x, stopwords("english"))

# tm_reduce() folds this list from right to left, so the LAST entry runs
# first: lower-case, drop stop words, drop punctuation, drop numbers, and
# finally collapse the whitespace left behind by the removals.
# tolower is wrapped in content_transformer() so that it returns a document
# instead of a bare character vector.
funcs <- list(
  stripWhitespace,
  removeNumbers,
  removePunctuation,
  skipWords,
  content_transformer(tolower)
)
a <- tm_map(corpus, FUN = tm_reduce, tmFuns = funcs)

# term-document matrix restricted to words of 3 to 10 characters
a.dtm1 <- TermDocumentMatrix(a, control = list(wordLengths = c(3, 10)))
newstopwords <- findFreqTerms(a.dtm1, lowfreq = 10) # get most frequent words

# remove most frequent words for this corpus
a.dtm2 <- a.dtm1[!(a.dtm1$dimnames$Terms) %in% newstopwords, ]
inspect(a.dtm2)

# carry on with typical things that can now be done, ie. cluster analysis
a.dtm3 <- removeSparseTerms(a.dtm2, sparse = 0.7)
# as.matrix() yields the full matrix; inspect() is meant for printing only
a.dtm.df <- as.data.frame(as.matrix(a.dtm3))
a.dtm.df.scale <- scale(a.dtm.df)
d <- dist(a.dtm.df.scale, method = "euclidean")
# "ward.D" is the current name of the method formerly called "ward"
fit <- hclust(d, method = "ward.D")
plot(fit)
# just for fun: draw a word cloud from the term-document matrix a.dtm1
library(wordcloud)
library(RColorBrewer)

# transpose so that every column holds the counts of one term
m <- as.matrix(t(a.dtm1))

# get word counts in decreasing order
word_freqs <- sort(colSums(m), decreasing = TRUE)

# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# plot wordcloud with the 8-colour "Dark2" palette
cloud_colours <- brewer.pal(8, "Dark2")
wordcloud(dm$word, dm$freq, random.order = FALSE, colors = cloud_colours)