Skip to content

Commit

Permalink
fixed styles, new functions to view tables, more modular functions
Browse files Browse the repository at this point in the history
  • Loading branch information
cecoeco committed May 6, 2024
1 parent bda594a commit 94a4e69
Show file tree
Hide file tree
Showing 19 changed files with 400 additions and 194 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "HTMLTables"
uuid = "b1afcece-b80e-4563-b90e-36b4cc56d3fa"
authors = ["Ceco E. Maples <[email protected]>"]
version = "0.3.0"
version = "0.3.1"

[deps]
Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
Expand Down
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
# HTMLTables

Julia package for reading and writing HTML tables.
Julia package for reading, writing, and viewing HTML tables.

## Reading HTML tables
- `HTMLTables.read`: extracts data from HTML tables.
Reading HTML tables:
- `HTMLTables.get` reads an HTML table from a URL or file as a parsed HTML node.
- `HTMLTables.read` extracts data from HTML tables.

## Writing HTML tables
- `HTMLTables.write`: uses the Tables.jl interface to write an HTML table.
Writing HTML tables:
- `HTMLTables.table` uses the Tables.jl interface to write an HTML table as a string.
- `HTMLTables.write` uses the Tables.jl interface to write an HTML table in a file.

Viewing HTML tables:
- `HTMLTables.display` displays a Julia table as an HTML table in Julia.
- `HTMLTables.open` opens a Julia table as an HTML table in the browser.

## License
Copyright © 2024 Ceco Elijah Maples
Expand Down
18 changes: 15 additions & 3 deletions src/HTMLTables.jl
Original file line number Diff line number Diff line change
@@ -1,21 +1,33 @@
"""
HTMLTables
Julia package for reading and writing HTML tables.
Julia package for reading, writing, and viewing HTML tables.
Reading HTML tables:
- `HTMLTables.get` reads an HTML table from a URL or file as a parsed HTML node.
- `HTMLTables.read` extracts data from HTML tables.
Writing HTML tables:
- `HTMLTables.write` uses the Tables.jl interface to write an HTML table.
- `HTMLTables.table` uses the Tables.jl interface to write an HTML table as a string.
- `HTMLTables.write` uses the Tables.jl interface to write an HTML table in a file.
Viewing HTML tables:
- `HTMLTables.display` displays a Julia table as an HTML table in Julia.
- `HTMLTables.open` opens a Julia table as an HTML table in the browser.
"""
# Top-level module: loads the dependencies, declares the public names, and pulls in
# one source file per public function.
module HTMLTables

using Cascadia, Colors, ColorSchemes, Gumbo, HTTP, Tables

# NOTE(review): `get`, `read`, `write`, `display`, and `open` share their names with
# Base functions — confirm that exporting them is intended.
export read, write
export get, read, table, write, display, open

# Reading: get.jl defines `isurl` and `get`; read.jl defines `extractrowdata` and `read`.
include("get.jl")
include("read.jl")

# Writing: presumably defines `table` and `write` (these files are not visible here).
include("table.jl")
include("write.jl")

# Viewing: display.jl defines `display`; open.jl defines `open`. Both call functions
# from the files included above.
include("display.jl")
include("open.jl")

end
10 changes: 10 additions & 0 deletions src/display.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
    HTMLTables.display(tbl; kwargs...)

Display a Julia table as an HTML table in Julia.

The table is rendered to an HTML string with `table(tbl; kwargs...)`, and that string
is handed to `Base.display` as raw `text/html` data.

## Arguments

- `tbl`: The table to display.
- `kwargs...`: Keyword arguments forwarded unchanged to `table`.
"""
function display(tbl; kwargs...)
    html_table::String = table(tbl; kwargs...)

    # The string holds HTML markup, so it has to be tagged as `text/html`; tagging it
    # as SVG makes front ends try to render it as an image.
    return Base.display("text/html", html_table)
end
65 changes: 65 additions & 0 deletions src/get.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Reports whether `source` contains an http, https, or ftp URL (matched case-insensitively).
function isurl(source::String)::Bool
    return Base.match(
        r"(?i)\b((?:https?|ftp):\/\/[\w-]+(\.[\w-]+)+([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)\b",
        source,
    ) !== nothing
end

"""
    HTMLTables.get(source::String; id::String="", classes::Union{Vector{String},String}="", index::Int=1)

Return an HTML table from a source as a parsed `Gumbo.HTMLNode`.

## Arguments

- `source::String`: URL or path to the HTML document that contains the table.
- `id::String`: The id of the HTML table. When non-empty, the element is selected by
  `#id` and `classes` is ignored.
- `classes::Union{Vector{String},String}`: The class or classes of the HTML table.
- `index::Int`: The index of the HTML table among the matching elements (starts at 1).

## Throws

- `ArgumentError`: if `index` is not positive, if no matching table is found, or if
  `index` is larger than the number of matching tables.
"""
function get(
    source::String;
    id::String="",
    classes::Union{Vector{String},String}="",
    index::Int=1
)
    if index <= 0
        Base.throw(Base.ArgumentError("Index must be a positive integer"))
    end

    # URLs are downloaded; anything else is treated as a path to a local file.
    html_content::String = if isurl(source)
        Base.String(HTTP.get(source).body)
    else
        Base.read(source, String)
    end

    html_document::Gumbo.HTMLDocument = Gumbo.parsehtml(html_content)

    # An id takes precedence over classes; with neither, every <table> matches.
    selector::String = if !Base.isempty(id)
        "#$id"
    elseif Base.isempty(classes)
        "table"
    elseif classes isa String
        "table.$classes"
    else
        "table." * Base.join(classes, ".")
    end

    matches::Vector{Gumbo.HTMLNode} = Base.eachmatch(
        Cascadia.Selector(selector), html_document.root
    )

    number_of_tables::Int = Base.length(matches)

    if number_of_tables == 0
        Base.throw(Base.ArgumentError("No HTML tables found"))
    elseif index > number_of_tables
        Base.throw(Base.ArgumentError("$number_of_tables table(s) found. Index $index does not exist."))
    end

    # Indexed directly rather than bound to a local named `table`, which would shadow
    # the module's own `table` function.
    return matches[index]
end
16 changes: 16 additions & 0 deletions src/open.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""
    HTMLTables.open(tbl; kwargs...)

Open a Julia table as an HTML table in the browser.

The table is written with `write(tbl; kwargs...)`, and the value that call returns is
used as the file path handed to the platform's default opener.

## Arguments

- `tbl`: The table to open.
- `kwargs...`: Keyword arguments forwarded unchanged to `write`.
"""
function open(tbl; kwargs...)
    path::String = write(tbl; kwargs...)

    if Base.Sys.iswindows()
        # `start` is a `cmd` builtin rather than an executable, so it has to be run
        # through `cmd /c`. The empty string is the window title, which stops `start`
        # from mistaking a quoted path for the title.
        return Base.run(`cmd /c start "" $path`)
    elseif Base.Sys.isapple()
        # Checked before the BSD branch because `Sys.isbsd()` is also true on macOS.
        return Base.run(`open $path`)
    elseif Base.Sys.islinux() || Base.Sys.isbsd()
        return Base.run(`xdg-open $path`)
    else
        # Report the path instead of silently doing nothing on an unknown platform.
        @warn "Cannot open a browser on this platform; the HTML table was written to $path"
        return nothing
    end
end
49 changes: 4 additions & 45 deletions src/read.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,7 @@
# Reports whether `source` contains an http, https, or ftp URL (matched case-insensitively).
function isurl(source::String)::Bool
    pattern = r"(?i)\b((?:https?|ftp):\/\/[\w-]+(\.[\w-]+)+([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)\b"

    return Base.findfirst(pattern, source) !== nothing
end

"""
    extractrowdata(row::Gumbo.HTMLNode)

Return the text of every element matched by the `td,th` selector within `row`, in the
order the matches are returned.
"""
function extractrowdata(row::Gumbo.HTMLNode)::Vector
    # One query is enough: the row is searched for data cells and header cells together.
    cells::Vector{Gumbo.HTMLNode} = Base.eachmatch(Cascadia.Selector("td,th"), row)

    return [Cascadia.nodeText(cell) for cell in cells]
end

"""
Expand All @@ -23,6 +15,7 @@ Reads a HTML table into a sink function such as `DataFrame`.
- `sink`: The function that materializes the table data.
- `id::String`: The id of the HTML table.
- `classes::Union{Vector{String},String}`: The classes of the HTML table.
- `index::Int`: The index of the HTML table in the HTML document.
## Examples
Expand All @@ -31,8 +24,6 @@ using HTMLTables, DataFrames
# read an HTML table into a DataFrame
df = HTMLTables.read("https://www.w3schools.com/html/html_tables.asp", DataFrame)
println(df)
```
"""
function read(
Expand All @@ -42,39 +33,7 @@ function read(
classes::Union{Vector{String},String}="",
index::Int=1
)
if index <= 0
throw(ArgumentError("Index must be a positive integer"))
end

if isurl(source) == true
response::HTTP.Response = HTTP.get(source)
html_content = Base.String(response.body)
else
html_content = Base.read(source, String)
end

html_document::Gumbo.HTMLDocument = Gumbo.parsehtml(html_content)

selector::String = ""
if Base.isempty(id)
if Base.isempty(classes)
selector *= "table"
elseif !Base.isempty(classes) && Base.isa(classes, String)
selector *= "table.$classes"
elseif !Base.isempty(classes) && Base.isa(classes, Vector{String})
selector *= "table." * Base.join(classes, ".")
end
elseif !Base.isempty(id)
selector *= "#$id"
end

tables::Vector{Gumbo.HTMLNode} = Base.eachmatch(Cascadia.Selector(selector), html_document.root)

if Base.isempty(tables) == true
throw(ArgumentError("No HTML tables found"))
end

table::Gumbo.HTMLNode = tables[index]
table = get(source, id=id, classes=classes, index=index)

rows::Vector{Gumbo.HTMLNode} = Base.eachmatch(Cascadia.Selector("tr"), table)
headers::Vector = []
Expand Down
Loading

0 comments on commit 94a4e69

Please sign in to comment.