使用 XML 包将 html 表刮入 R 数据框架

如何使用 XML 包刮取 html 表?

巴西足球队上的这个维基百科页面为例。我想阅读它在 R,并得到“所有比赛的名单,巴西已经与国际足联认可的球队”表作为一个数据框架。我怎么能这么做?

126066 次浏览
library(RCurl)
library(XML)


# Download page using RCurl
# You may need to set proxy details, etc.,  in the call to getURL
theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)
# Process escape characters
webpage <- readLines(tc <- textConnection(webpage)); close(tc)


# Parse the html tree, ignoring errors on the page
pagetree <- htmlTreeParse(webpage, error=function(...){})


# Navigate your way through the tree. It may be possible to do this more efficiently using getNodeSet
body <- pagetree$children$html$children$body
divbodyContent <- body$children$div$children[[1]]$children$div$children[[4]]
tables <- divbodyContent$children[names(divbodyContent)=="table"]


#In this case, the required table is the only one with class "wikitable sortable"
tableclasses <- sapply(tables, function(x) x$attributes["class"])
thetable  <- tables[which(tableclasses=="wikitable sortable")]$table


#Get columns headers
headers <- thetable$children[[1]]$children
columnnames <- unname(sapply(headers, function(x) x$children$text$value))


# Get rows from table
content <- c()
for(i in 2:length(thetable$children))
{
tablerow <- thetable$children[[i]]$children
opponent <- tablerow[[1]]$children[[2]]$children$text$value
others <- unname(sapply(tablerow[-1], function(x) x$children$text$value))
content <- rbind(content, c(opponent, others))
}


# Convert to data frame
colnames(content) <- columnnames
as.data.frame(content)

编辑补充:

样本输出

                     Opponent Played Won Drawn Lost Goals for Goals against  % Won
1               Argentina     94  36    24   34       148           150  38.3%
2                Paraguay     72  44    17   11       160            61  61.1%
3                 Uruguay     72  33    19   20       127            93  45.8%
...

使用 Xpath 的另一个选项。

library(RCurl)
library(XML)


theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)


pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)


# Extract table header and contents
tablehead <- xpathSApply(pagetree, "//*/table[@class='wikitable sortable']/tr/th", xmlValue)
results <- xpathSApply(pagetree, "//*/table[@class='wikitable sortable']/tr/td", xmlValue)


# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 8, byrow = TRUE))


# Clean up the results
content[,1] <- gsub(" ", "", content[,1])
tablehead <- gsub(" ", "", tablehead)
names(content) <- tablehead

产生这个结果

> head(content)
Opponent Played Won Drawn Lost Goals for Goals against % Won
1 Argentina     94  36    24   34       148           150 38.3%
2  Paraguay     72  44    17   11       160            61 61.1%
3   Uruguay     72  33    19   20       127            93 45.8%
4     Chile     64  45    12    7       147            53 70.3%
5      Peru     39  27     9    3        83            27 69.2%
6    Mexico     36  21     6    9        69            34 58.3%

或者更短的尝试:

library(XML)
library(RCurl)
library(rlist)
theurl <- getURL("https://en.wikipedia.org/wiki/Brazil_national_football_team",.opts = list(ssl.verifypeer = FALSE) )
tables <- readHTMLTable(theurl)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))

挑选的桌子是页面上最长的一张

tables[[which.max(n.rows)]]

rvestxml2是解析 html 网页的另一个流行软件包。

library(rvest)
theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
file<-read_html(theurl)
tables<-html_nodes(file, "table")
table1 <- html_table(tables[4], fill = TRUE)

语法比 xml软件包更容易使用,而且对于大多数网页,该软件包提供了所需的所有选项。