diff --git a/NAMESPACE b/NAMESPACE index 36d9b92..b6fb459 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ export(jsonpointer) export(version) importFrom(cli,cli_progress_bar) importFrom(cli,cli_progress_done) +importFrom(tools,file_ext) importFrom(utils,head) importFrom(utils,tail) useDynLib(rjsoncons, .registration = TRUE) diff --git a/R/j_data_type.R b/R/j_data_type.R index e0d5832..f5f54df 100644 --- a/R/j_data_type.R +++ b/R/j_data_type.R @@ -40,6 +40,8 @@ #' #' @inheritParams j_query #' +#' @importFrom tools file_ext +#' #' @details #' #' `j_data_type()` without any arguments reports possible return @@ -49,10 +51,13 @@ #' #' - For a scalar (length 1) character `data`, either `"url"` #' (matching regular expression `"^https?://"`, `"file"` -#' (`file.exists(data)` returns `TRUE`), or `"json"`. When `"file"` -#' or `"url"` is inferred, the return value is a length 2 vector, -#' with the first element the inferred type of data (`"json"` or -#' `"ndjson"`) obtained from the first 2 lines of the file. +#' (`file.exists(data)` returns `TRUE`), or `"json"`. Compressed +#' data with extension `".gz"`, `".bz2"`, or `".xz"` is supported; +#' `".zip"` is not supported and signals an error. When `"file"` +#' or `"url"` is inferred, the return value is a length 2 vector, +#' with the first element the inferred type of data (`"json"` or +#' `"ndjson"`) obtained from the first 2 lines of the file or, for +#' compressed data, from the extension (e.g., `".ndjson.gz"`). 
#' - For character data with `length(data) > 1`, `"ndjson"` if all #' elements start a square bracket or curly brace, consistently #' (i.e., agreeing with the start of the first record), otherwise @@ -91,10 +96,24 @@ j_data_type <- ) if (length(data) == 1L) { - ## url or file path or json - if (length(grep("^https?://", data))) { + is_url <- grepl("^https?://", data, TRUE) + is_file <- file.exists(data) + ## compressed and uncompressed url or file path + if (file_ext(data) %in% c("gz", "bz2", "xz")) { + format <- + if (grepl("\\.ndjson\\.(gz|bz2|xz)$", data, TRUE)) + "ndjson" + else if (grepl("\\.json\\.(gz|bz2|xz)$", data, TRUE)) + "json" + c( + format, + if (is_url) "url" else if (is_file) "file" + ) + } else if (identical(file_ext(data), "zip")) { + stop("`j_data_type()` of type `.zip` is not supported") + } else if (is_url) { c(j_data_type(readLines(data, 2L, warn = FALSE)), "url") - } else if (file.exists(data)) { + } else if (is_file) { c(j_data_type(readLines(data, 2L, warn = FALSE)), "file") } else if (.is_scalar_character(data) && !inherits(data, "AsIs")) { "json" diff --git a/man/j_data_type.Rd b/man/j_data_type.Rd index 6dd3b15..087ddaf 100644 --- a/man/j_data_type.Rd +++ b/man/j_data_type.Rd @@ -33,10 +33,13 @@ validate) the type of \code{data} based on the following rules: \itemize{ \item For a scalar (length 1) character \code{data}, either \code{"url"} (matching regular expression \code{"^https?://"}, \code{"file"} -(\code{file.exists(data)} returns \code{TRUE}), or \code{"json"}. When \code{"file"} -or \code{"url"} is inferred, the return value is a length 2 vector, -with the first element the inferred type of data (\code{"json"} or -\code{"ndjson"}) obtained from the first 2 lines of the file. +(\code{file.exists(data)} returns \code{TRUE}), or \code{"json"}. Compressed +data with extension \code{".gz"}, \code{".bz2"}, or \code{".xz"} is supported; +\code{".zip"} is not supported and signals an error. 
When \code{"file"} +or \code{"url"} is inferred, the return value is a length 2 vector, +with the first element the inferred type of data (\code{"json"} or +\code{"ndjson"}) obtained from the first 2 lines of the file or, for +compressed data, from the extension (e.g., \code{".ndjson.gz"}). \item For character data with \code{length(data) > 1}, \code{"ndjson"} if all elements start a square bracket or curly brace, consistently (i.e., agreeing with the start of the first record), otherwise