Skip to content

Commit

Permalink
cut out permission checking method 'robotstxt' -- fixes #22
Browse files Browse the repository at this point in the history
  • Loading branch information
petermeissner committed Feb 10, 2018
1 parent fcd02f0 commit 476bcab
Show file tree
Hide file tree
Showing 18 changed files with 91 additions and 551 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: robotstxt
Date: 2017-11-12
Date: 2018-02-10
Type: Package
Title: A 'robots.txt' Parser and 'Webbot'/'Spider'/'Crawler' Permissions Checker
Version: 0.5.2
Version: 0.6.0
Authors@R: c(
person(
"Peter", "Meissner", role = c("aut", "cre"),
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@ NEWS robotstxt
==========================================================================


0.6.0 | 2018-02-10
--------------------------------------------------------------------------

- **change/fix** the check function paths_allowed() would not return the correct result in some edge cases, indicating that the spiderbar/rep-cpp check method is more reliable and shall be the default and only method: [see 1](https://github.com/ropenscilabs/robotstxt/issues/22), [see 2](https://github.com/hrbrmstr/spiderbar/issues/2), [see 3](https://github.com/seomoz/rep-cpp/issues/33)




0.5.2 | 2017-11-12
--------------------------------------------------------------------------

Expand Down
7 changes: 6 additions & 1 deletion R/get_robotstxt.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,12 @@ get_robotstxt <-

# ok
if( request$status < 400 ){
rtxt <- httr::content(request, encoding="UTF-8", as="text")
rtxt <-
httr::content(
request,
encoding="UTF-8",
as="text"
)

# check if robots.txt is parsable
if ( is_valid_robotstxt(rtxt) ){
Expand Down
86 changes: 0 additions & 86 deletions R/path_allowed.R

This file was deleted.

53 changes: 13 additions & 40 deletions R/paths_allowed.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
#' check if a bot has permissions to access page(s)
#'
#' wrapper to \code{\link{path_allowed}}
#'
#' @param domain Domain for which paths should be checked. Defaults to "auto".
#' If set to "auto" function will try to guess the domain by parsing the paths
#' argument. Note however, that these are educated guesses which might utterly
#' fail. To be on the safe side, provide appropriate domains manually.
#' @param bot name of the bot, defaults to "*"
#' @param paths paths for which to check bot's permission, defaults to "/"
#' @param check_method which method to use for checking -- either
#' "robotstxt" for the package's own method or "spiderbar"
#' for using spiderbar::can_fetch; note that at the current
#' state spiderbar is considered less accurate: the spiderbar
#' algorithm will only take into consideration rules for *
#' or a particular bot but does not merge rules together
#' (see: \code{paste0(system.file("robotstxts", package = "robotstxt"),"/selfhtml_Example.txt")})
#' @param check_method deprecated; kept only for backward compatibility. Do not
#'   use this parameter anymore -- leave it unset and the function will simply
#'   use the default method.
#' @param robotstxt_list either NULL -- the default -- or a list of character
#' vectors with one vector per path to check
#'
#' @inheritParams get_robotstxt
#' @inheritParams get_robotstxts
#'
#' @seealso \link{path_allowed}
#'
#' @export
paths_allowed <-
Expand All @@ -29,7 +22,7 @@ paths_allowed <-
domain = "auto",
bot = "*",
user_agent = utils::sessionInfo()$R.version$version.string,
check_method = c("robotstxt", "spiderbar"),
check_method = c("spiderbar"),
warn = TRUE,
force = FALSE,
ssl_verifypeer = c(1,0),
Expand Down Expand Up @@ -62,44 +55,24 @@ paths_allowed <-
}

# check paths
if ( check_method[1] == "robotstxt"){
warning(
"
This check method is deprecated,
please stop using it -
use 'spiderbar' instead
or do not specify check_method parameter at all.
"
)
}
res <-
if ( check_method[1] == "spiderbar"){

paths_allowed_worker_spiderbar(
domain = domain,
bot = bot,
paths = paths,
robotstxt_list = robotstxt_list
)

} else {

if( use_futures ){
permissions_list <-
future::future_lapply(
robotstxt_list,
function(x){robotstxt(text=x)$permissions}
)

}else{
permissions_list <-
lapply(
robotstxt_list,
function(x){robotstxt(text=x)$permissions}
)

}

paths_allowed_worker_robotstxt(
domain = domain,
bot = bot,
paths = paths,
permissions_list = permissions_list
)

}


# return
return(res)
}
Expand Down
40 changes: 0 additions & 40 deletions R/paths_allowed_worker_robotstxt.R

This file was deleted.

14 changes: 0 additions & 14 deletions R/paths_allowed_worker_spiderbar.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,6 @@ paths_allowed_worker_spiderbar <-
path = path,
user_agent = bot
)

# star_can_fetch <-
# spiderbar::can_fetch(
# obj = rtxt_obj,
# path = path,
# user_agent = "*"
# )

# return
# if ( star_can_fetch == FALSE ) {
# return(FALSE)
# }else{
# return(bot_can_fetch)
# }
}
return(bot_can_fetch)
}
Expand Down
7 changes: 6 additions & 1 deletion R/robotstxt.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,15 @@ robotstxt <-
self$host <- tmp$host
self$sitemap <- tmp$sitemap
self$other <- tmp$other
self$robexclobj <- spiderbar::robxp(self$text)

self$check <-
function(paths="/", bot="*"){
sapply(paths, path_allowed, permissions=self$permissions, bot=bot)
spiderbar::can_fetch(
obj = self$robexclobj,
path = paths,
user_agent = bot
)
}

# return
Expand Down
15 changes: 0 additions & 15 deletions R/sanitize_permission_values.R

This file was deleted.

16 changes: 0 additions & 16 deletions R/sanitize_permissions.R

This file was deleted.

29 changes: 0 additions & 29 deletions man/path_allowed.Rd

This file was deleted.

15 changes: 3 additions & 12 deletions man/paths_allowed.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 476bcab

Please sign in to comment.