html – 从R中的多个网页上的表格中刮取数据(足球运动员)

前端之家收集整理的这篇文章主要介绍了html – 从R中的多个网页上的表格中刮取数据(足球运动员),前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。




这是我第一次真正进入数据抓取,所以我试着用答案找到类似的问题.我找到的最接近的答案是this question


塞缪尔·文图拉(Samuel L. Ventura)最近也发表了关于NFL数据抓取的讨论,可以在here找到.



> # unlist into a single character vector
> links <- unlist(links)
> # Go to each URL in the list and scrape all the data from the tables
> # this will take some time... don't interrupt it! 
> all_tables <- lapply(links,readHTMLTable,stringsAsFactors = FALSE)
Error in UseMethod("xmlNamespaceDefinitions") : 
 no applicable method for 'xmlNamespaceDefinitions' applied to an object of class "NULL"
> # Put player names in the list so we know who the data belong to
> # extract names from the URLs to their stats page...
> toMatch <- c("","-1.html")
> player_names <- unique (gsub(paste(toMatch,collapse="|"),"",links))
Error: cannot allocate vector of size 512 Kb
> # assign player names to list of tables
> names(all_tables) <- player_names
Error: object 'player_names' not found
> fix(inx_page)
Error in edit(name,file,title,editor) : 
  unexpected '<' occurred on line 1
 use a command like
 x <- edit()
 to recover
In addition: Warning message:
In edit.default(name,editor = defaultEditor) :
  deparse may be incomplete


> all_tables <- lapply(links,stringsAsFactors = FALSE)
Error in UseMethod("xmlNamespaceDefinitions") : 
 no applicable method for 'xmlNamespaceDefinitions' applied to an object of class "NULL"


# Quoted fragment from the index-page loop: prepends a URL prefix to every
# scraped href in `lnk` and stores the result in slot `i` of `links`.
# NOTE(review): the prefix string is empty in this listing - confirm the
# intended site root before relying on this line.
links[[i]] <- paste0("",lnk)




首先列出所有球员页面的URL …

# Collect the URL of every player page by walking the a-z index pages.
# Result: `links`, a character vector with one URL per player link found.

# library() stops immediately if a package is missing; require() would only
# warn and let the script fail later with a less helpful error.
library(RCurl)
library(XML)

# NOTE(review): both URL prefixes are empty strings in this listing, which
# cannot work as published. Fill them in before running:
#   index_url_prefix  - address that precedes "<letter>-index.html"
#   player_url_prefix - address prepended to the relative hrefs scraped below
index_url_prefix <- ""
player_url_prefix <- ""

# pre-allocate list to fill, one slot per index page
links <- vector("list", length = length(letters))
for (i in seq_along(letters)) {
  print(i) # keep track of how far the loop has got
  # download and parse the index page for this letter
  index_url <- paste0(index_url_prefix, letters[i], "-index.html")
  inx_page <- htmlParse(getURI(index_url))
  # pull the href attribute of every anchor on the page
  lnk <- unname(xpathSApply(inx_page, "//a/@href"))
  # drop the leading and trailing links that are constant on each page;
  # (length(lnk) - 10):length(lnk) covers the final 11 entries
  lnk <- lnk[-c(1:63, (length(lnk) - 10):length(lnk))]
  # only keep links that go to players (exclude schools)
  lnk <- lnk[grepl("players", lnk)]
  # the scraped hrefs are incomplete, so prepend the prefix to make them
  # usable from anywhere
  links[[i]] <- paste0(player_url_prefix, lnk)
}
# unlist into a single character vector
links <- unlist(links)



# Go to each URL in `links` and scrape all the data from the tables on it.
# Result: `all_tables`, a list with one slot per URL (each slot holds the list
# of tables returned by readHTMLTable, or NULL if that page failed), named by
# player.
# This will take some time... don't interrupt it!

# pre-allocate list
all_tables <- vector("list", length = length(links))
for (i in seq_along(links)) {
  # error handling - report the failure and carry on with the next URL
  page_tables <- tryCatch(
    readHTMLTable(links[i], stringsAsFactors = FALSE),
    error = function(e) {
      message("Skipping ", links[i], ": ", conditionMessage(e))
      NULL
    }
  )
  # assigning NULL with [[<- would delete the slot and shift every later
  # element, so only store successful results
  if (!is.null(page_tables)) {
    all_tables[[i]] <- page_tables
  }
}

# Put player names in the list so we know who the data belong to:
# strip the fixed leading and trailing parts of each stats-page URL.
# NOTE(review): the first pattern is an empty string in this listing; it is
# presumably the URL prefix that precedes the player name - fill it in.
toMatch <- c("", "-1.html")
# an empty alternative would turn the regex into "|-1.html", so drop blanks
toMatch <- toMatch[nzchar(toMatch)]
# no unique() here: names must stay aligned one-to-one with `links`, and
# de-duplicating would shorten the vector and shift names onto wrong tables
player_names <- gsub(paste(toMatch, collapse = "|"), "", links)
# assign player names to list of tables
names(all_tables) <- player_names


   Year School Conf Class Pos Solo Ast Tot Loss  Sk Int Yds Avg TD PD FR Yds TD FF
1 *2007   Utah  MWC    FR  DL    2   1   3  0.0 0.0   0   0      0  0  0   0  0  0
2 *2010   Utah  MWC    SR  DL    4   4   8  2.5 1.5   0   0      0  1  0   0  0  0

   Year School Conf Class Pos Ret Yds  Avg TD Ret Yds Avg TD
1 *2007   Utah  MWC    FR  DL   0   0       0   0   0      0
2 *2010   Utah  MWC    SR  DL   2  24 12.0  0   0   0      0

   Year School Conf Class Pos Rec Yds  Avg TD Att Yds Avg TD Plays Yds  Avg TD
1 *2007   Utah  MWC    FR  DL   1  41 41.0  0   0   0      0     1  41 41.0  0
2 *2010   Utah  MWC    SR  DL   0   0       0   0   0      0     0   0       0


# Extract the "passing" table from every player's set of tables.
# [[ ]] matches the name exactly, whereas $ would also accept a partial match.
passing <- lapply(all_tables, function(player_tables) player_tables[["passing"]])
# Players without a passing table contribute NULL, and a list is not a
# convenient format, so stack everything into a single data frame.
# rbind skips the NULL entries and builds row names from the list names.
passing <- do.call(rbind, passing)


Year             School Conf Class Pos Cmp Att  Pct  Yds Y/A AY/A TD Int  Rate
james-aaron  1978          Air Force  Ind        QB  28  56 50.0  316 5.6  3.6  1   3  92.6
jeff-aaron.1 2000 Alabama-Birmingham CUSA    JR  QB 100 182 54.9 1135 6.2  6.0  5   3 113.1
jeff-aaron.2 2001 Alabama-Birmingham CUSA    SR  QB  77 148 52.0  828 5.6  4.3  4   6  99.8
