diff --git a/R/known_tags.R b/R/known_tags.R index b225100e..4f298f66 100644 --- a/R/known_tags.R +++ b/R/known_tags.R @@ -27,8 +27,8 @@ known_tags <- c( "code", # html "col", # html "colgroup", # html - "color-profile", # svg - "command", # + "color-profile", # deprecated + "command", # deprecated "data", # html "datalist", # html "dd", # html @@ -45,7 +45,7 @@ known_tags <- c( "ellipse", # svg "em", # html "embed", # html - "eventsource", # + "eventsource", # deprecated "feBlend", # svg "feColorMatrix", # svg "feComponentTransfer",# svg @@ -89,7 +89,7 @@ known_tags <- c( "hatchpath", # svg "head", # html "header", # html - "hgroup", # html + "hgroup", # deprecated "hr", # html "html", # html "i", # html @@ -99,7 +99,7 @@ known_tags <- c( "input", # html "ins", # html "kbd", # html - "keygen", # + "keygen", # deprecated "label", # html "legend", # html "li", # html @@ -111,6 +111,7 @@ known_tags <- c( "mark", # html "marker", # svg "mask", # svg + "math", # html "menu", # html "meta", # html "metadata", # svg @@ -130,15 +131,16 @@ known_tags <- c( "picture", # html "polygon", # svg "polyline", # svg + "portal", # html "pre", # html "progress", # html "q", # html "radialGradient", # svg - "rb", # html + "rb", # deprecated "rect", # svg "rp", # html "rt", # html - "rtc", # html + "rtc", # deprecated "ruby", # html "s", # html "samp", # html @@ -148,7 +150,7 @@ known_tags <- c( "set", # svg "slot", # html "small", # html - "solidcolor", # svg + "solidcolor", # deprecated "source", # html "span", # html "stop", # svg @@ -157,7 +159,7 @@ known_tags <- c( "sub", # html "summary", # html "sup", # html - "svg", # svg + "svg", # html svg "switch", # svg "symbol", # svg "table", # html diff --git a/scripts/generate_known_tags.R b/scripts/generate_known_tags.R index 96d82cac..6a815517 100644 --- a/scripts/generate_known_tags.R +++ b/scripts/generate_known_tags.R @@ -3,47 +3,77 @@ ## This script web scrapes two Mozilla websites for HTML and SVG tag elements. 
## All HTML tags +library(rvest) +library(dplyr) + +# Note: Mozilla seems to have a more up to date set of what is possible / not obsolete compared to W3 schools +base_url <- "https://developer.mozilla.org/en-US/docs/Web" + +html_tag_dfs <- read_html(file.path(base_url, "HTML", "Element")) %>% + html_table() + +# The last table is obsolete/deprecated elements +n_dfs <- length(html_tag_dfs) + +html_tags_df <- html_tag_dfs[-n_dfs] %>% + bind_rows() %>% + # h1-h6 all appear in one comma-separated row + mutate(name = strsplit(Element, ", ")) %>% + tidyr::unnest(name) %>% + select(Element = name, Description) %>% + transmute( + name = sub("^<", "", sub(">$", "", Element)), + desc = paste( + Description, "Learn more at", + file.path(base_url, "HTML", "Element", name) + ) + ) + +svg <- read_html(file.path(base_url, "SVG", "Element")) + +# Due to a lack of structure on the SVG page, +# this seems to be the best way to target just +# the hyperlinks under the "SVG elements A to Z" section +svg_tags <- lapply(letters, function(x) { + html_elements(svg, sprintf("h3[id=%s] + div > ul > li > a", x)) %>% + html_attr("href") %>% + basename() +}) + +# TODO: eventually it might be nice to also scrape +# the descriptions by following the url +svg_tags_df <- tibble( + name = unlist(svg_tags), + desc = sprintf( + "Creates the <%s> SVG element. Learn more at %s", + name, file.path(base_url, "SVG", "Element", name) + ) +) -library(magrittr) - - -get_tags <- function(url, css) { - url %>% - httr::GET() %>% - httr::content() %>% - rvest::html_nodes(css) %>% - rvest::html_text() %>% - sub("^<", "", .) %>% - sub(">$", "", .) 
%>% - sort() %>% - unique() %>% - print() -} - -## W3 Schools -## Mozilla seemed to have a more up to date set of what is possible / not obsolete -# w3html_tags <- get_tags("https://www.w3schools.com/tags/default.asp", "#htmltags tr td:first-child a:not(.notsupported)") -## Had extra tags not seen in other places `altGlyph` -# w3svg_tags <- get_tags("https://www.w3schools.com/graphics/svg_reference.asp", "#main td:first-child") - -## W3 Standard -# # The original spec websites made it very hard to determine what was obsolete / shouldn't be used and what was to be used -# html_tags <- get_tags("https://www.w3.org/TR/2018/WD-html53-20181018/single-page.html", "dfn[data-dfn-type='element']") -# svg_tags <- get_tags("https://svgwg.org/svg2-draft/single-page.html", "dfn[data-dfn-type='element']") +# Save a JSON version so other languages can read them in easily +cat( + jsonlite::toJSON(html_tags_df), + file = "scripts/html_tags.json" +) -## Mozilla -# do not include the last section of obsolete tags -html_tags <- get_tags("https://developer.mozilla.org/en-US/docs/Web/HTML/Element", "article table:not(:last-child) td:first-child code") -# html_tags_obsolete <- get_tags("https://developer.mozilla.org/en-US/docs/Web/HTML/Element", "#content table:last-child td:first-child a") +cat( + jsonlite::toJSON(svg_tags_df), + file = "scripts/svg_tags.json" +) -# do not include tags that do not contain documentation articles -# Only pull from the index, as elements not in the index are considered obsolete. 
(ex: altGlyph or font-face) -svg_tags <- get_tags("https://developer.mozilla.org/en-US/docs/Web/SVG/Element", "article .index a:not([rel='nofollow']) code") +html_tags <- html_tags_df$name +svg_tags <- svg_tags_df$name # Both SVG2 and HTML5 svg_tags[svg_tags %in% html_tags] +#> [1] "a" "script" "style" "svg" "title" + + +new_tags <- c(svg_tags, html_tags) %>% + unique() %>% + sort() # Call using callr::r to avoid any devtools loaded htmltools::tags namespace issues cran_tags <- callr::r( @@ -54,27 +84,43 @@ cran_tags <- callr::r( show = TRUE ) -new_tags <- c(svg_tags, html_tags) %>% unique() %>% sort() - # tags which should not HTML5 / SVG2 supported setdiff(cran_tags, new_tags) -#> "command" "eventsource" "keygen" +#> [1] "color-profile" "command" "eventsource" "hgroup" +#> [5] "keygen" "rb" "rtc" "solidcolor" # New HTML5 tags setdiff(html_tags, cran_tags) -#> "rb" "rtc" "slot" +#> "portal" "math" + # New SVG2 tags setdiff(svg_tags, cran_tags) -### ...basically all svg tags +#> character(0) # combine old and new tags so that old tags are not lost -save_tags <- c(new_tags, cran_tags) %>% unique() %>% sort() +save_tags <- c(new_tags, cran_tags) %>% + unique() %>% + sort() save_line <- paste0( - format(paste0(" \"", save_tags, "\"", ifelse(seq_along(save_tags) == length(save_tags), "", ",")), justify = "left"), "#", - ifelse(save_tags %in% html_tags, " html", " "), - ifelse(save_tags %in% svg_tags, " svg", "") + format( + paste0( + " \"", save_tags, "\"", + ifelse( + seq_along(save_tags) == length(save_tags), + "", "," + ) + ), + justify = "left" + ), + "#", + case_when( + save_tags %in% html_tags & save_tags %in% svg_tags ~ " html svg", + save_tags %in% html_tags ~ " html", + save_tags %in% svg_tags ~ " svg", + TRUE ~ " deprecated" + ) ) %>% sub("\\s+$", "", .) 
cat( diff --git a/scripts/html_tags.json b/scripts/html_tags.json new file mode 100644 index 00000000..0ffaf301 --- /dev/null +++ b/scripts/html_tags.json @@ -0,0 +1 @@ +[{"name":"html","desc":"The HTML element represents the root (top-level element) of an HTML document, so it is also referred to as the root element. All other elements must be descendants of this element. Learn more at https://developer.mozilla.org/en-US/docs/Web/HTML/Element/html"},{"name":"base","desc":"The HTML element specifies the base URL to use for all relative URLs in a document. There can be only one element in a document. Learn more at https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base"},{"name":"head","desc":"The HTML element contains machine-readable information (metadata) about the document, like its title, scripts, and style sheets. Learn more at https://developer.mozilla.org/en-US/docs/Web/HTML/Element/head"},{"name":"link","desc":"The HTML element specifies relationships between the current document and an external resource.\n This element is most commonly used to link to CSS, but is also used to establish site icons (both \"favicon\" style icons and icons for the home screen and apps on mobile devices) among other things. Learn more at https://developer.mozilla.org/en-US/docs/Web/HTML/Element/link"},{"name":"meta","desc":"The HTML element represents Metadata that cannot be represented by other HTML meta-related elements, like base, link, script, style or title. Learn more at https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta"},{"name":"style","desc":"The