Rによる自動データ収集 読書めも4

estis 2018/05/13 (日) 16:43 に投稿

p.320

start_session(root = "http://localhost:4444/wd/hub/", browser = "firefox")

で、browser = "firefox" だと、エラーが出るので、

https://sites.google.com/a/chromium.org/chromedriver/downloads

から、Chromedriver をダウンロードして、browser = "chrome" にすることにした。

> start_session(root = "http://localhost:4444/wd/hub/", browser = "chrome")
[1] "Started new session. sessionList created."
> post.url(url = "http://www.r-datacollection.com/materials/selenium/intro.html")
> get.url()
[1] "http://www.r-datacollection.com/materials/selenium/intro.html"
> page_title()
[1] "The Federal Contributions Database"
> buttonID <- element_xpath_find(value = "/html/body/div/div[2]/form/input")
> element_click(ID = buttonID)
> allHandles <- window_handles()
> window_change(allHandles[1])
 rawToChar(getURLContent(paste0(seleniumSession$sessionURL, "/window"),  でエラー: 
   引数 'x' は raw ベクトルでなくてはなりません 
> getURLContent(paste0(seleniumSession$sessionURL, "/window")
+ )
[1] "{\"sessionId\":\"91b4ba4356a57ec8497f3973cd7e6802\",\"status\":0,\"value\":\"CDwindow-268FE40834454ABD212F9FCCE422B65E\"}"
attr(,"Content-Type")
                              charset 
"application/json"            "utf-8" 
> yearID <- element_xpath_find(value = '//*[@id="yearSelect"]')
> monthID <- element_xpath_find(value = '//*[@id="monthSelect"]')
> recipID <- element_xpath_find(value = '//*[@id="recipientSelect"]')
> element_click(yearID)
> keys("2013")
[1] "{\"sessionId\":\"91b4ba4356a57ec8497f3973cd7e6802\",\"status\":0,\"value\":null}"
attr(,"Content-Type")
                              charset 
"application/json"            "utf-8" 
> element_click(monthID)
> keys("January")
[1] "{\"sessionId\":\"91b4ba4356a57ec8497f3973cd7e6802\",\"status\":0,\"value\":null}"
attr(,"Content-Type")
                              charset 
"application/json"            "utf-8" 
> element_click(recipID)
> keys("barack Obama")
[1] "{\"sessionId\":\"91b4ba4356a57ec8497f3973cd7e6802\",\"status\":0,\"value\":null}"
attr(,"Content-Type")
                              charset 
"application/json"            "utf-8" 
> submitID <- element_xpath_find(value = '//*[@id="yearForm"]/div/button')
> element_click(submitID)
> pageSource <- page_source()
> moneyTab <- readHTMLTable(pageSource, which = 1)
> colnames(moneyTab) <- c("year", "name", "party", "contributor", "state", "amount")
> moneyTab <- moneyTab[-1, ]
> head(moneyTab)
  year         name party             contributor state amount
2 2013 Barack Obama     D           ROBERTS, GARY    TX    -50
3 2013 Barack Obama     D    TOENNIES, MICHAEL MR    CO    -55
4 2013 Barack Obama     D           PENTA, NEELAM    NY   -100
5 2013 Barack Obama     D        VALENSTEIN, JILL    NY    -15
6 2013 Barack Obama     D SPRECHER KEATING, KAREN    DC   -100
7 2013 Barack Obama     D         FISCHER, DAMIEN    CA   -100

p.325 のデータ取得には成功しているが、

rawToChar(getURLContent(paste0(seleniumSession$sessionURL, "/window"), でエラー: 引数 'x' は raw ベクトルでなくてはなりません

というエラーが気になる。

Comments

Comment

rawToChar(getURLContent(paste0(seleniumSession$sessionURL, "/window"), でエラー: 引数 'x' は raw ベクトルでなくてはなりません

は、window_change 関数を以下のように書き直すことで、出なくなった(1つ目が変更前、2つ目が変更後の定義)。

# window_change() before the change: POSTs the given window handle as JSON to
# <sessionURL>/window and passes the response through rawToChar().
# In the session shown above, getURLContent() on this endpoint printed a
# character string (with a Content-Type attribute), and rawToChar() stopped
# with the "argument 'x' must be a raw vector" error.
function (handle = NULL) 
{
    # A handle is mandatory; fail early when the caller passes none.
    if (is.null(handle)) {
        stop("Missing handle")
    }
    # rawToChar() only accepts raw vectors, which is where the error is raised
    # when the response is already character.
    rawToChar(getURLContent(paste0(seleniumSession$sessionURL, 
        "/window"), customrequest = "POST", httpheader = c(`Content-Type` = "application/json;charset=UTF-8"), 
        postfields = toJSON(list(name = handle))))
}

# window_change() after the change: POSTs the given window handle as JSON to
# <sessionURL>/window and returns the server response as text.
#
# handle: window handle to switch to (e.g. an element of window_handles()).
# Returns the response body as a character string.
# Stops with "Missing handle" when handle is NULL.
function (handle = NULL) 
{
    # A handle is mandatory; fail early when the caller passes none.
    if (is.null(handle)) {
        stop("Missing handle")
    }
    response <- getURLContent(paste0(seleniumSession$sessionURL, 
        "/window"), customrequest = "POST", httpheader = c(`Content-Type` = "application/json;charset=UTF-8"), 
        postfields = toJSON(list(name = handle)))
    # getURLContent() returns a raw vector for binary content and a character
    # string for text content. Convert only in the raw case, so neither the
    # rawToChar() error nor a raw return value can occur.
    if (is.raw(response)) {
        response <- rawToChar(response)
    }
    response
}

に変更した。