From 8063a8aa3cf7edbb6a07af6c2236277a346d7e8b Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Thu, 3 Oct 2024 23:27:12 +0100 Subject: [PATCH 01/18] Remove the gc call in read.jl To make this work, I had to set the `enable_cache` kwarg to `true` rather than false for `readdata()` and `readtable()` --- src/read.jl | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/src/read.jl b/src/read.jl index 9f3e304..a97c2d4 100644 --- a/src/read.jl +++ b/src/read.jl @@ -147,8 +147,6 @@ function openxlsx(f::F, source::Union{AbstractString, IO}; close(xf) end - # fix libuv issue on windows (#42) and other systems (#173) - GC.gc() end end @@ -203,9 +201,8 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, continue end - # Rather than ignore custom XML internal files here, let them get passed through to write like binaries are. + # let customXML files get passed through to write like binaries are (below). if !startswith(f.name, "customXml") && (endswith(f.name, ".xml") || endswith(f.name, ".rels")) - #if endswith(f.name, ".xml") || endswith(f.name, ".rels") # XML file internal_xml_file_add!(xf, f.name) @@ -217,19 +214,12 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, continue end - # ignore custom XML internal files - # no longer needed if these files are passed through like binary files - #if startswith(f.name, "customXml") - # continue - #end - internal_xml_file_read(xf, f.name) end - elseif read_as_template - # Binary file - # we only read binary files to save the Excel file later - # Custom XML files also now get passed through this way, too + elseif read_as_template + # Binary and customXML files + # we only read these files to save the Excel file later bytes = ZipFile.read(f) @assert sizeof(bytes) == f.uncompressedsize xf.binary_data[f.name] = bytes @@ -530,14 +520,13 @@ julia> XLSX.readdata("myfile.xlsx", "mysheet!A2:B4") ``` """ function readdata(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, ref) - c = openxlsx(source, enable_cache=false) do xf + c = openxlsx(source, enable_cache=true) do xf getdata(getsheet(xf, sheet), ref) end return c end - function readdata(source::Union{AbstractString, IO}, sheetref::AbstractString) - c = openxlsx(source, enable_cache=false) do xf + c = openxlsx(source, enable_cache=true) do xf getdata(xf, sheetref) end return c @@ -610,14 +599,14 @@ julia> df = DataFrame(XLSX.readtable("myfile.xlsx", "mysheet")) See also: [`XLSX.gettable`](@ref). """ -function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false, keep_empty_rows::Bool=false) +function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=true, keep_empty_rows::Bool=false) c = openxlsx(source, enable_cache=enable_cache) do xf gettable(getsheet(xf, sheet); first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows) end return c end -function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, columns::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false, keep_empty_rows::Bool=false) +function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, columns::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=true, keep_empty_rows::Bool=false) c = openxlsx(source, enable_cache=enable_cache) do xf gettable(getsheet(xf, sheet), columns; first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows) end From 7ccb8879d2290e004e0cdba8643a53f3002233fa Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Thu, 3 Oct 2024 23:31:54 +0100 Subject: [PATCH 02/18] Added tests for rm after readdata and readtable These tests will fail with `enable_cache=false` for both `readdata()` and `readtable()` (as in the current master). This PR changes this kwarg for these functions to `true`. --- test/runtests.jl | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 1eda966..87653f7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1354,14 +1354,21 @@ end @test dt_read.column_labels == dt.column_labels @test dt_read.column_label_index == dt.column_label_index end +end - # delete files created by this testset - delete_files = ["output_table.xlsx", "output_tables.xlsx"] - for f in delete_files - isfile(f) && rm(f) +@testset "rm after read" begin + @testset "with readtable" begin + f = "output_table.xlsx" + dtable = XLSX.readtable(f, "report", "F") + @test isfile(f) && rm(f)==nothing + end + @testset "with readdata" begin + f = "output_tables.xlsx" + dtable = XLSX.readdata(f, "REPORT_A", "A1:B3") + @test isfile(f) && rm(f)==nothing end end - + @testset "Styles" begin using XLSX: CellValue, id, getcell, setdata!, CellRef From 3dc95bce7d00a0bbde892452d059b5defd7ca633 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:18:17 +0100 Subject: [PATCH 03/18] Remove remaining dependence on ZipFiles --- src/read.jl | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/read.jl b/src/read.jl index a97c2d4..8ba0650 100644 --- a/src/read.jl +++ b/src/read.jl @@ -132,7 +132,7 @@ function openxlsx(f::F, source::Union{AbstractString, IO}; if _read @assert source isa IO || isfile(source) "File $source not found." - xf = open_or_read_xlsx(source, _write, enable_cache, _write) + xf = open_or_read_xlsx(source, _read, enable_cache, _write) else xf = open_empty_template() end @@ -167,7 +167,7 @@ function openxlsx(source::Union{AbstractString, IO}; if _read @assert source isa IO || isfile(source) "File $source not found." - return open_or_read_xlsx(source, _write, enable_cache, _write) + return open_or_read_xlsx(source, _read, enable_cache, _write) else return open_empty_template() end @@ -194,35 +194,35 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, xf = XLSXFile(source, enable_cache, read_as_template) try - for f in xf.io.files + for (i, f) in enumerate(ZipArchives.zip_names(xf.io)) # ignore xl/calcChain.xml in any case (#31) - if f.name == "xl/calcChain.xml" + if f == "xl/calcChain.xml" continue end # let customXML files get passed through to write like binaries are (below). - if !startswith(f.name, "customXml") && (endswith(f.name, ".xml") || endswith(f.name, ".rels")) + if !startswith(f, "customXml") && (endswith(f, ".xml") || endswith(f, ".rels")) # XML file - internal_xml_file_add!(xf, f.name) + internal_xml_file_add!(xf, f) if read_files # ignore worksheet files because they'll be read thru streaming # If reading as template, it will be loaded in two places: here and WorksheetCache. - if !read_as_template && startswith(f.name, "xl/worksheets") && endswith(f.name, ".xml") + if !read_as_template && startswith(f, "xl/worksheets") && endswith(f, ".xml") continue end - internal_xml_file_read(xf, f.name) + internal_xml_file_read(xf, f) end elseif read_as_template # Binary and customXML files # we only read these files to save the Excel file later - bytes = ZipFile.read(f) - @assert sizeof(bytes) == f.uncompressedsize - xf.binary_data[f.name] = bytes + bytes = read(IOBuffer(ZipArchives.zip_readentry(xf.io, f, String))) + @assert sizeof(bytes) == ZipArchives.zip_uncompressed_size(xf.io, i) + xf.binary_data[f] = bytes end end @@ -250,8 +250,10 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, end finally - if read_files + if read_files # Always close(xf) + else # Never + println("read_files isn't true!") end end @@ -436,12 +438,12 @@ function internal_xml_file_read(xf::XLSXFile, filename::String) :: EzXML.Documen if !internal_xml_file_isread(xf, filename) @assert isopen(xf) "Can't read from a closed XLSXFile." file_not_found = true - for f in xf.io.files - if f.name == filename + for f in ZipArchives.zip_names(xf.io) + if f == filename xf.files[filename] = true # set file as read try - xf.data[filename] = EzXML.readxml(f) + xf.data[filename] = EzXML.parsexml(ZipArchives.zip_readentry(xf.io, f, String)) catch err @error("Failed to parse internal XML file `$filename`") rethrow() @@ -463,8 +465,7 @@ end function Base.close(xl::XLSXFile) xl.io_is_open = false - close(xl.io) - +# close(xl.io) # close all internal file streams from worksheet caches for sheet in xl.workbook.sheets if sheet.cache != nothing && sheet.cache.stream_state != nothing From fc3f8b16cf21d6c15b2e9d1405e525440978d6f1 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:20:00 +0100 Subject: [PATCH 04/18] Remove remaining dependency on ZipFiles --- src/types.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/types.jl b/src/types.jl index 0651d36..05b8b96 100644 --- a/src/types.jl +++ b/src/types.jl @@ -284,7 +284,7 @@ sh = xf["mysheet"] # get a reference to a Worksheet mutable struct XLSXFile <: MSOfficePackage source::Union{AbstractString, IO} use_cache_for_sheet_data::Bool # indicates whether Worksheet.cache will be fed while reading worksheet cells. - io::ZipFile.Reader + io::ZipArchives.ZipReader io_is_open::Bool files::Dict{String, Bool} # maps filename => isread bool data::Dict{String, EzXML.Document} # maps filename => XMLDocument @@ -295,7 +295,7 @@ mutable struct XLSXFile <: MSOfficePackage function XLSXFile(source::Union{AbstractString, IO}, use_cache::Bool, is_writable::Bool) check_for_xlsx_file_format(source) - io = ZipFile.Reader(source) + io = ZipArchives.ZipReader(read(source)) xl = new(source, use_cache, io, true, Dict{String, Bool}(), Dict{String, EzXML.Document}(), Dict{String, Vector{UInt8}}(), EmptyWorkbook(), Vector{Relationship}(), is_writable) xl.workbook.package = xl finalizer(close, xl) From 281d5c11dab10b0ebda7e85666c3bcc5564b9586 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:22:05 +0100 Subject: [PATCH 05/18] Remove any remaining dependence on ZipFile --- src/XLSX.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/XLSX.jl b/src/XLSX.jl index eae3ce9..9e6b199 100644 --- a/src/XLSX.jl +++ b/src/XLSX.jl @@ -4,7 +4,6 @@ module XLSX import Artifacts import Dates import Printf.@printf -import ZipFile import ZipArchives import EzXML import Tables @@ -34,4 +33,6 @@ include("cell.jl") include("styles.jl") include("write.jl") + + end # module XLSX From 7343cabde7ac953aa3446ed47336ecd540467ec5 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:30:16 +0100 Subject: [PATCH 06/18] Remove last trace of ZipFile --- src/XLSX.jl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/XLSX.jl b/src/XLSX.jl index 9e6b199..feac92d 100644 --- a/src/XLSX.jl +++ b/src/XLSX.jl @@ -9,12 +9,7 @@ import EzXML import Tables import Base.convert -# /~https://github.com/fhs/ZipFile.jl/issues/39 -if !hasmethod(Base.bytesavailable, Tuple{ZipFile.ReadableFile}) - Base.bytesavailable(f::ZipFile.ReadableFile) = f.uncompressedsize - f._pos -end - -const SPREADSHEET_NAMESPACE_XPATH_ARG = [ "xpath" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main" ] +const SPREADSHEET_NAMESPACE_XPATH_ARG = ["xpath" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main"] const EXCEL_MAX_COLS = 16_384 # total columns supported by Excel per sheet const EXCEL_MAX_ROWS = 1_048_576 # total rows supported by Excel per sheet (including headers) From e0979b3ea72deb7e63b749e33560940b2e58e193 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:34:50 +0100 Subject: [PATCH 07/18] Remove dependency on ZipFile --- Project.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Project.toml b/Project.toml index f8401d5..a725e49 100644 --- a/Project.toml +++ b/Project.toml @@ -12,13 +12,11 @@ EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c" -ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [compat] EzXML = "1" Tables = "1" ZipArchives = "2" -ZipFile = "0.8, 0.9, 0.10" julia = "1.6" [extras] From b189ea11496faac5c2ffe76f35a360309cbb7ea0 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:46:24 +0100 Subject: [PATCH 08/18] Update types.jl --- src/types.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/types.jl b/src/types.jl index 05b8b96..09f6c32 100644 --- a/src/types.jl +++ b/src/types.jl @@ -194,7 +194,7 @@ Implementations: SheetRowStreamIterator, WorksheetCache. abstract type SheetRowIterator end mutable struct SheetRowStreamIteratorState - zip_io::ZipFile.Reader + zip_io::ZipArchives.ZipReader xml_stream_reader::EzXML.StreamReader is_open::Bool # indicated if zip_io and xml_stream_reader are opened row::Int # number of current row. It´s set to 0 in the start state. From d59aa6933151f76fbabc9cc09696db80edbb5a99 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:40:01 +0100 Subject: [PATCH 09/18] Remove ZipFile --- src/XLSX.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/XLSX.jl b/src/XLSX.jl index feac92d..d45e3ef 100644 --- a/src/XLSX.jl +++ b/src/XLSX.jl @@ -9,7 +9,7 @@ import EzXML import Tables import Base.convert -const SPREADSHEET_NAMESPACE_XPATH_ARG = ["xpath" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main"] +const SPREADSHEET_NAMESPACE_XPATH_ARG = [ "xpath" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main" ] const EXCEL_MAX_COLS = 16_384 # total columns supported by Excel per sheet const EXCEL_MAX_ROWS = 1_048_576 # total rows supported by Excel per sheet (including headers) @@ -28,6 +28,4 @@ include("cell.jl") include("styles.jl") include("write.jl") - - end # module XLSX From 55b570b655108919355598ca1e6b79160751a068 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:40:55 +0100 Subject: [PATCH 10/18] Remove ZipFile and gc call on read --- src/read.jl | 617 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 617 insertions(+) diff --git a/src/read.jl b/src/read.jl index 8ba0650..50de817 100644 --- a/src/read.jl +++ b/src/read.jl @@ -123,6 +123,623 @@ XLSX.openxlsx("edit.xlsx", mode="rw") do xf end ``` +See also [`XLSX.readxlsx`](@ref). +""" +function openxlsx(f::F, source::Union{AbstractString, IO}; + mode::AbstractString="r", enable_cache::Bool=true) where {F<:Function} + + _read, _write = parse_file_mode(mode) + + if _read + @assert source isa IO || isfile(source) "File $source not found." + + xf = open_or_read_xlsx(source, _read, enable_cache, _write) + else + xf = open_empty_template() + end + + + try + f(xf) + finally + + if _write + writexlsx(source, xf, overwrite=true) + else + close(xf) + end + + end +end + +""" + openxlsx(source::Union{AbstractString, IO}; mode="r", enable_cache=true) :: XLSXFile + +Supports opening a XLSX file without using do-syntax. +In this case, the user is responsible for closing the `XLSXFile` +using `close` or writing it to file using `XLSX.writexlsx`. + +See also [`XLSX.writexlsx`](@ref). +""" +function openxlsx(source::Union{AbstractString, IO}; + mode::AbstractString="r", + enable_cache::Bool=true) :: XLSXFile + + _read, _write = parse_file_mode(mode) + + if _read + @assert source isa IO || isfile(source) "File $source not found." + return open_or_read_xlsx(source, _read, enable_cache, _write) + else + return open_empty_template() + end +end + +function parse_file_mode(mode::AbstractString) :: Tuple{Bool, Bool} + if mode == "r" + return (true, false) + elseif mode == "w" + return (false, true) + elseif mode == "rw" || mode == "wr" + return (true, true) + else + error("Couldn't parse file mode $mode.") + end +end + +function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, enable_cache::Bool, read_as_template::Bool) :: XLSXFile + # sanity check + if read_as_template + @assert read_files && enable_cache + end + + xf = XLSXFile(source, enable_cache, read_as_template) + + try + for (i, f) in enumerate(ZipArchives.zip_names(xf.io)) + + # ignore xl/calcChain.xml in any case (#31) + if f == "xl/calcChain.xml" + continue + end + + # let customXML files get passed through to write like binaries are (below). + if !startswith(f, "customXml") && (endswith(f, ".xml") || endswith(f, ".rels")) + + # XML file + internal_xml_file_add!(xf, f) + if read_files + + # ignore worksheet files because they'll be read thru streaming + # If reading as template, it will be loaded in two places: here and WorksheetCache. + if !read_as_template && startswith(f, "xl/worksheets") && endswith(f, ".xml") + continue + end + + internal_xml_file_read(xf, f) + end + + elseif read_as_template + # Binary and customXML files + # we only read these files to save the Excel file later + bytes = read(IOBuffer(ZipArchives.zip_readentry(xf.io, f, String))) + @assert sizeof(bytes) == ZipArchives.zip_uncompressed_size(xf.io, i) + xf.binary_data[f] = bytes + end + end + + check_minimum_requirements(xf) + parse_relationships!(xf) + parse_workbook!(xf) + + # read data from Worksheet streams + if read_files + for sheet_name in sheetnames(xf) + sheet = getsheet(xf, sheet_name) + + # to read sheet content, we just need to iterate a SheetRowIterator and the data will be stored in cache + for r in eachrow(sheet) + nothing + end + end + end + + if read_as_template + wb = get_workbook(xf) + if has_sst(wb) + sst_load!(wb) + end + end + + finally + if read_files # Always + close(xf) + else # Never + println("read_files isn't true!") + end + end + + return xf +end + +function get_default_namespace(r::EzXML.Node) :: String + nss = EzXML.namespaces(r) + # in case that only one namespace is defined, assume that it is the default one + # even if it has a prefix + length(nss) == 1 && return nss[1][2] + # otherwise, look for the default namespace (without prefix) + for (prefix, ns) in nss + if prefix == "" + return ns + end + end + + error("No default namespace found.") +end + +# See section 12.2 - Package Structure +function check_minimum_requirements(xf::XLSXFile) + mandatory_files = ["_rels/.rels", + "xl/workbook.xml", + "[Content_Types].xml", + "xl/_rels/workbook.xml.rels" + ] + + for f in mandatory_files + @assert in(f, filenames(xf)) "Malformed XLSX File. Couldn't find file $f in the package." + end + + nothing +end + +# Parses package level relationships defined in `_rels/.rels`. +# Parses workbook level relationships defined in `xl/_rels/workbook.xml.rels`. +function parse_relationships!(xf::XLSXFile) + + # package level relationships + xroot = get_package_relationship_root(xf) + for el in EzXML.eachelement(xroot) + push!(xf.relationships, Relationship(el)) + end + @assert !isempty(xf.relationships) "Relationships not found in _rels/.rels!" + + # workbook level relationships + wb = get_workbook(xf) + xroot = get_workbook_relationship_root(xf) + for el in EzXML.eachelement(xroot) + push!(wb.relationships, Relationship(el)) + end + @assert !isempty(wb.relationships) "Relationships not found in xl/_rels/workbook.xml.rels" + + nothing +end + +# Updates xf.workbook from xf.data[\"xl/workbook.xml\"] +function parse_workbook!(xf::XLSXFile) + xroot = xmlroot(xf, "xl/workbook.xml") + @assert EzXML.nodename(xroot) == "workbook" "Malformed xl/workbook.xml. Root node name should be 'workbook'. Got '$(EzXML.nodename(xroot))'." + + # workbook to be parsed + workbook = get_workbook(xf) + + # workbookPr -> date1904 + # does not have attribute => is not date1904 + workbook.date1904 = false + + # changes workbook.date1904 if there is a setting in the workbookPr node + for node in EzXML.eachelement(xroot) + if EzXML.nodename(node) == "workbookPr" + + # read date1904 attribute + if haskey(node, "date1904") + attribute_value_date1904 = node["date1904"] + + if attribute_value_date1904 == "1" || attribute_value_date1904 == "true" + workbook.date1904 = true + elseif attribute_value_date1904 == "0" || attribute_value_date1904 == "false" + workbook.date1904 = false + else + error("Could not parse xl/workbook -> workbookPr -> date1904 = $(attribute_value_date1904).") + end + end + + break + end + end + + # sheets + sheets = Vector{Worksheet}() + for node in EzXML.eachelement(xroot) + if EzXML.nodename(node) == "sheets" + + for sheet_node in EzXML.eachelement(node) + @assert EzXML.nodename(sheet_node) == "sheet" "Unsupported node $(EzXML.nodename(sheet_node)) in 'xl/workbook.xml'." + worksheet = Worksheet(xf, sheet_node) + push!(sheets, worksheet) + end + + break + end + end + workbook.sheets = sheets + + # named ranges + for node in EzXML.eachelement(xroot) + if EzXML.nodename(node) == "definedNames" + for defined_name_node in EzXML.eachelement(node) + @assert EzXML.nodename(defined_name_node) == "definedName" + defined_value_string = EzXML.nodecontent(defined_name_node) + name = defined_name_node["name"] + + local defined_value::DefinedNameValueTypes + + if is_valid_fixed_sheet_cellname(defined_value_string) || is_valid_sheet_cellname(defined_value_string) + defined_value = SheetCellRef(defined_value_string) + elseif is_valid_fixed_sheet_cellrange(defined_value_string) || is_valid_sheet_cellrange(defined_value_string) + defined_value = SheetCellRange(defined_value_string) + elseif occursin(r"^\".*\"$", defined_value_string) # is enclosed by quotes + defined_value = defined_value_string[2:end-1] # remove enclosing quotes + if isempty(defined_value) + defined_value = missing + end + elseif tryparse(Int, defined_value_string) != nothing + defined_value = parse(Int, defined_value_string) + elseif tryparse(Float64, defined_value_string) != nothing + defined_value = parse(Float64, defined_value_string) + elseif isempty(defined_value_string) + defined_value = missing + else + + # Couldn't parse definedName. Will silently ignore it, since this is not a critical feature. + continue + + # debug + #error("Could not parse value $(defined_value_string) for definedName $name.") + end + + if haskey(defined_name_node, "localSheetId") + # is a Worksheet level name + + # localSheetId is the 0-based index of the Worksheet in the order + # that it is displayed on screen. + # Which is the order of the elements under element in workbook.xml . + localSheetId = parse(Int, defined_name_node["localSheetId"]) + 1 + sheetId = workbook.sheets[localSheetId].sheetId + workbook.worksheet_names[(sheetId, name)] = defined_value + else + # is a Workbook level name + workbook.workbook_names[name] = defined_value + end + end + + break + end + end + + nothing +end + +# Lazy loading of XML files + +# Lists internal files from the XLSX package. +@inline filenames(xl::XLSXFile) = keys(xl.files) + +# Returns true if the file data was read into xl.data. +@inline internal_xml_file_isread(xl::XLSXFile, filename::String) :: Bool = xl.files[filename] +@inline internal_xml_file_exists(xl::XLSXFile, filename::String) :: Bool = haskey(xl.files, filename) + +function internal_xml_file_add!(xl::XLSXFile, filename::String) + @assert endswith(filename, ".xml") || endswith(filename, ".rels") + xl.files[filename] = false + nothing +end + +function internal_xml_file_read(xf::XLSXFile, filename::String) :: EzXML.Document + @assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)." + + if !internal_xml_file_isread(xf, filename) + @assert isopen(xf) "Can't read from a closed XLSXFile." + file_not_found = true + for f in ZipArchives.zip_names(xf.io) + if f == filename + xf.files[filename] = true # set file as read + + try + xf.data[filename] = EzXML.parsexml(ZipArchives.zip_readentry(xf.io, f, String)) + catch err + @error("Failed to parse internal XML file `$filename`") + rethrow() + end + + file_not_found = false + break + end + end + + if file_not_found + # shouldn't happen + error("$filename not found in XLSX package.") + end + end + + return xf.data[filename] +end + +function Base.close(xl::XLSXFile) + xl.io_is_open = false +# close(xl.io) + # close all internal file streams from worksheet caches +# for sheet in xl.workbook.sheets +# if sheet.cache != nothing && sheet.cache.stream_state != nothing +# close(sheet.cache.stream_state) +# end +# end +end + +Base.isopen(xl::XLSXFile) = xl.io_is_open + +# Utility method to find the XMLDocument associated with a given package filename. +# Returns xl.data[filename] if it exists. Throws an error if it doesn't. +@inline xmldocument(xl::XLSXFile, filename::String) :: EzXML.Document = internal_xml_file_read(xl, filename) + +# Utility method to return the root element of a given XMLDocument from the package. +# Returns EzXML.root(xl.data[filename]) if it exists. +@inline xmlroot(xl::XLSXFile, filename::String) :: EzXML.Node = EzXML.root(xmldocument(xl, filename)) + +# +# Helper Functions +# + +""" + readdata(source, sheet, ref) + readdata(source, sheetref) + +Returns a scalar or matrix with values from a spreadsheet. + +See also [`XLSX.getdata`](@ref). + +# Examples + +These function calls are equivalent. + +```julia +julia> XLSX.readdata("myfile.xlsx", "mysheet", "A2:B4") +3×2 Array{Any,2}: + 1 "first" + 2 "second" + 3 "third" + +julia> XLSX.readdata("myfile.xlsx", 1, "A2:B4") +3×2 Array{Any,2}: + 1 "first" + 2 "second" + 3 "third" + +julia> XLSX.readdata("myfile.xlsx", "mysheet!A2:B4") +3×2 Array{Any,2}: + 1 "first" + 2 "second" + 3 "third" +``` +""" +function readdata(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, ref) + c = openxlsx(source, enable_cache=true) do xf + getdata(getsheet(xf, sheet), ref) + end + return c +end +function readdata(source::Union{AbstractString, IO}, sheetref::AbstractString) + c = openxlsx(source, enable_cache=true) do xf + getdata(xf, sheetref) + end + return c +end + +""" + readtable( + source, + sheet, + [columns]; + [first_row], + [column_labels], + [header], + [infer_eltypes], + [stop_in_empty_row], + [stop_in_row_function], + [keep_empty_rows] + ) -> DataTable + +Returns tabular data from a spreadsheet as a struct `XLSX.DataTable`. +Use this function to create a `DataFrame` from package `DataFrames.jl`. + +Use `columns` argument to specify which columns to get. +For example, `"B:D"` will select columns `B`, `C` and `D`. +If `columns` is not given, the algorithm will find the first sequence +of consecutive non-empty cells. + +Use `first_row` to indicate the first row from the table. +`first_row=5` will look for a table starting at sheet row `5`. +If `first_row` is not given, the algorithm will look for the first +non-empty row in the spreadsheet. + +`header` is a `Bool` indicating if the first row is a header. +If `header=true` and `column_labels` is not specified, the column labels +for the table will be read from the first row of the table. +If `header=false` and `column_labels` is not specified, the algorithm +will generate column labels. The default value is `header=true`. + +Use `column_labels` to specify names for the header of the table. + +Use `infer_eltypes=true` to get `data` as a `Vector{Any}` of typed vectors. +The default value is `infer_eltypes=false`. + +`stop_in_empty_row` is a boolean indicating whether an empty row marks the end of the table. +If `stop_in_empty_row=false`, the `TableRowIterator` will continue to fetch rows until there's no more rows in the Worksheet. +The default behavior is `stop_in_empty_row=true`. + +`stop_in_row_function` is a Function that receives a `TableRow` and returns a `Bool` indicating if the end of the table was reached. + +Example for `stop_in_row_function`: + +``` +function stop_function(r) + v = r[:col_label] + return !ismissing(v) && v == "unwanted value" +end +``` + +`keep_empty_rows` determines whether rows where all column values are equal to `missing` are kept (`true`) or dropped (`false`) from the resulting table. +`keep_empty_rows` never affects the *bounds* of the table; the number of rows read from a sheet is only affected by, `first_row`, `stop_in_empty_row` and `stop_in_row_function` (if specified). +`keep_empty_rows` is only checked once the first and last row of the table have been determined, to see whether to keep or drop empty rows between the first and the last row. + +# Example + +```julia +julia> using DataFrames, XLSX + +julia> df = DataFrame(XLSX.readtable("myfile.xlsx", "mysheet")) +``` + +See also: [`XLSX.gettable`](@ref). +""" +function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false, keep_empty_rows::Bool=false) + c = openxlsx(source, enable_cache=enable_cache) do xf + gettable(getsheet(xf, sheet); first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows) + end + return c +end + +function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, columns::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false, keep_empty_rows::Bool=false) + c = openxlsx(source, enable_cache=enable_cache) do xf + gettable(getsheet(xf, sheet), columns; first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows) + end + return c +end + +@inline get_xlsxfile(wb::Workbook) :: XLSXFile = wb.package +@inline get_xlsxfile(ws::Worksheet) :: XLSXFile = ws.package +@inline get_workbook(ws::Worksheet) :: Workbook = get_xlsxfile(ws).workbook +@inline get_workbook(xl::XLSXFile) :: Workbook = xl.workbook + +const ZIP_FILE_HEADER = [ 0x50, 0x4b, 0x03, 0x04 ] +const XLS_FILE_HEADER = [ 0xd0, 0xcf, 0x11, 0xe0 ] + +function check_for_xlsx_file_format(source::IO, label::AbstractString="input") + local header::Vector{UInt8} + + mark(source) + header = Base.read(source, 4) + reset(source) + + if header == ZIP_FILE_HEADER # valid Zip file header + return + elseif header == XLS_FILE_HEADER # old XLS file + error("$label looks like an old XLS file (not XLSX). This package does not support XLS file format.") + else + error("$label is not a valid XLSX file.") + end +end + +function check_for_xlsx_file_format(filepath::AbstractString) + @assert isfile(filepath) "File $filepath not found." + + open(filepath, "r") do io + check_for_xlsx_file_format(io, filepath) + end +end + +""" + readxlsx(source::Union{AbstractString, IO}) :: XLSXFile + +Main function for reading an Excel file. +This function will read the whole Excel file into memory +and return a closed XLSXFile. + +Consider using [`XLSX.openxlsx`](@ref) for lazy loading of Excel file contents. +""" +@inline readxlsx(source::Union{AbstractString, IO}) :: XLSXFile = open_or_read_xlsx(source, true, true, false) + +""" + openxlsx(f::F, source::Union{AbstractString, IO}; mode::AbstractString="r", enable_cache::Bool=true) where {F<:Function} + +Open XLSX file for reading and/or writing. It returns an opened XLSXFile that will be automatically closed after applying `f` to the file. + +# `Do` syntax + +This function should be used with `do` syntax, like in: + +```julia +XLSX.openxlsx("myfile.xlsx") do xf + # read data from `xf` +end +``` + +# Filemodes + +The `mode` argument controls how the file is opened. The following modes are allowed: + +* `r` : read mode. The existing data in `source` will be accessible for reading. This is the **default** mode. + +* `w` : write mode. Opens an empty file that will be written to `source`. + +* `rw` : edit mode. Opens `source` for editing. The file will be saved to disk when the function ends. + +!!! warning + + The `rw` mode is known to produce some data loss. See [#159](/~https://github.com/felipenoris/XLSX.jl/issues/159). + + Simple data should work fine. Users are advised to use this feature with caution when working with formulas and charts. + +# Arguments + +* `source` is IO or the complete path to the file. + +* `mode` is the file mode, as explained in the last section. + +* `enable_cache`: + +If `enable_cache=true`, all read worksheet cells will be cached. +If you read a worksheet cell twice it will use the cached value instead of reading from disk +in the second time. + +If `enable_cache=false`, worksheet cells will always be read from disk. +This is useful when you want to read a spreadsheet that doesn't fit into memory. + +The default value is `enable_cache=true`. + +# Examples + +## Read from file + +The following example shows how you would read worksheet cells, one row at a time, +where `myfile.xlsx` is a spreadsheet that doesn't fit into memory. + +```julia +julia> XLSX.openxlsx("myfile.xlsx", enable_cache=false) do xf + for r in XLSX.eachrow(xf["mysheet"]) + # read something from row `r` + end + end +``` + +## Write a new file + +```julia +XLSX.openxlsx("new.xlsx", mode="w") do xf + sheet = xf[1] + sheet[1, :] = [1, Date(2018, 1, 1), "test"] +end +``` + +## Edit an existing file + +```julia +XLSX.openxlsx("edit.xlsx", mode="rw") do xf + sheet = xf[1] + sheet[2, :] = [2, Date(2019, 1, 1), "add new line"] +end +``` + See also [`XLSX.readxlsx`](@ref). """ function openxlsx(f::F, source::Union{AbstractString, IO}; From 14a30c6fb52d707774f4ccb1522b9fbcf97a56af Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:42:00 +0100 Subject: [PATCH 11/18] Remove ZipFile --- src/stream.jl | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/stream.jl b/src/stream.jl index 09eedae..b603fee 100644 --- a/src/stream.jl +++ b/src/stream.jl @@ -47,14 +47,13 @@ end Base.show(io::IO, state::SheetRowStreamIteratorState) = print(io, "SheetRowStreamIteratorState( is open = $(state.is_open) , row = $(state.row) )") # Opens a file for streaming. -@inline function open_internal_file_stream(xf::XLSXFile, filename::String) :: Tuple{ZipFile.Reader, EzXML.StreamReader} +@inline function open_internal_file_stream(xf::XLSXFile, filename::String) :: Tuple{ZipArchives.ZipReader, EzXML.StreamReader} @assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)." @assert xf.source isa IO || isfile(xf.source) "Can't open internal file $filename for streaming because the XLSX file $(xf.filepath) was not found." - io = ZipFile.Reader(xf.source) - for f in io.files - if f.name == filename - return io, EzXML.StreamReader(f) + for f in ZipArchives.zip_names(xf.io) + if f == filename + return xf.io, EzXML.StreamReader(IOBuffer(ZipArchives.zip_readentry(xf.io, f, String))) end end @@ -67,7 +66,7 @@ end if isopen(s) s.is_open = false close(s.xml_stream_reader) - close(s.zip_io) +# close(s.zip_io) end nothing end @@ -102,7 +101,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo elseif is_end_of_sheet_data(reader) # this Worksheet has no rows close(reader) - close(zip_io) +# close(zip_io) return nothing end end @@ -117,7 +116,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo reader = state.xml_stream_reader if is_end_of_sheet_data(reader) - @assert !isopen(state) +# @assert !isopen(state) return nothing else @assert isopen(state) "Error processing Worksheet $(ws.name): Can't fetch rows from a closed workbook." @@ -133,7 +132,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo while EzXML.iterate(reader) != nothing if is_end_of_sheet_data(reader) - close(state) +# close(state) break end @@ -142,7 +141,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo while true if is_end_of_sheet_data(reader) - close(state) +# close(state) break elseif EzXML.nodetype(reader) == EzXML.READER_ELEMENT && nodename(reader) == "row" break From f54349a68b390da2df27a6086fad9d418706204e Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:43:58 +0100 Subject: [PATCH 12/18] Remove ZipFile and gc call on read --- src/read.jl | 615 ---------------------------------------------------- 1 file changed, 615 deletions(-) diff --git a/src/read.jl b/src/read.jl index 50de817..e66fc25 100644 --- a/src/read.jl +++ b/src/read.jl @@ -615,618 +615,3 @@ function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractStrin end return c end - -@inline get_xlsxfile(wb::Workbook) :: XLSXFile = wb.package -@inline get_xlsxfile(ws::Worksheet) :: XLSXFile = ws.package -@inline get_workbook(ws::Worksheet) :: Workbook = get_xlsxfile(ws).workbook -@inline get_workbook(xl::XLSXFile) :: Workbook = xl.workbook - -const ZIP_FILE_HEADER = [ 0x50, 0x4b, 0x03, 0x04 ] -const XLS_FILE_HEADER = [ 0xd0, 0xcf, 0x11, 0xe0 ] - -function check_for_xlsx_file_format(source::IO, label::AbstractString="input") - local header::Vector{UInt8} - - mark(source) - header = Base.read(source, 4) - reset(source) - - if header == ZIP_FILE_HEADER # valid Zip file header - return - elseif header == XLS_FILE_HEADER # old XLS file - error("$label looks like an old XLS file (not XLSX). This package does not support XLS file format.") - else - error("$label is not a valid XLSX file.") - end -end - -function check_for_xlsx_file_format(filepath::AbstractString) - @assert isfile(filepath) "File $filepath not found." - - open(filepath, "r") do io - check_for_xlsx_file_format(io, filepath) - end -end - -""" - readxlsx(source::Union{AbstractString, IO}) :: XLSXFile - -Main function for reading an Excel file. -This function will read the whole Excel file into memory -and return a closed XLSXFile. - -Consider using [`XLSX.openxlsx`](@ref) for lazy loading of Excel file contents. -""" -@inline readxlsx(source::Union{AbstractString, IO}) :: XLSXFile = open_or_read_xlsx(source, true, true, false) - -""" - openxlsx(f::F, source::Union{AbstractString, IO}; mode::AbstractString="r", enable_cache::Bool=true) where {F<:Function} - -Open XLSX file for reading and/or writing. It returns an opened XLSXFile that will be automatically closed after applying `f` to the file. - -# `Do` syntax - -This function should be used with `do` syntax, like in: - -```julia -XLSX.openxlsx("myfile.xlsx") do xf - # read data from `xf` -end -``` - -# Filemodes - -The `mode` argument controls how the file is opened. The following modes are allowed: - -* `r` : read mode. The existing data in `source` will be accessible for reading. This is the **default** mode. - -* `w` : write mode. Opens an empty file that will be written to `source`. - -* `rw` : edit mode. Opens `source` for editing. The file will be saved to disk when the function ends. - -!!! warning - - The `rw` mode is known to produce some data loss. See [#159](/~https://github.com/felipenoris/XLSX.jl/issues/159). - - Simple data should work fine. Users are advised to use this feature with caution when working with formulas and charts. - -# Arguments - -* `source` is IO or the complete path to the file. - -* `mode` is the file mode, as explained in the last section. - -* `enable_cache`: - -If `enable_cache=true`, all read worksheet cells will be cached. -If you read a worksheet cell twice it will use the cached value instead of reading from disk -in the second time. - -If `enable_cache=false`, worksheet cells will always be read from disk. -This is useful when you want to read a spreadsheet that doesn't fit into memory. - -The default value is `enable_cache=true`. - -# Examples - -## Read from file - -The following example shows how you would read worksheet cells, one row at a time, -where `myfile.xlsx` is a spreadsheet that doesn't fit into memory. - -```julia -julia> XLSX.openxlsx("myfile.xlsx", enable_cache=false) do xf - for r in XLSX.eachrow(xf["mysheet"]) - # read something from row `r` - end - end -``` - -## Write a new file - -```julia -XLSX.openxlsx("new.xlsx", mode="w") do xf - sheet = xf[1] - sheet[1, :] = [1, Date(2018, 1, 1), "test"] -end -``` - -## Edit an existing file - -```julia -XLSX.openxlsx("edit.xlsx", mode="rw") do xf - sheet = xf[1] - sheet[2, :] = [2, Date(2019, 1, 1), "add new line"] -end -``` - -See also [`XLSX.readxlsx`](@ref). -""" -function openxlsx(f::F, source::Union{AbstractString, IO}; - mode::AbstractString="r", enable_cache::Bool=true) where {F<:Function} - - _read, _write = parse_file_mode(mode) - - if _read - @assert source isa IO || isfile(source) "File $source not found." - xf = open_or_read_xlsx(source, _read, enable_cache, _write) - else - xf = open_empty_template() - end - - try - f(xf) - finally - - if _write - writexlsx(source, xf, overwrite=true) - else - close(xf) - end - - end -end - -""" - openxlsx(source::Union{AbstractString, IO}; mode="r", enable_cache=true) :: XLSXFile - -Supports opening a XLSX file without using do-syntax. -In this case, the user is responsible for closing the `XLSXFile` -using `close` or writing it to file using `XLSX.writexlsx`. - -See also [`XLSX.writexlsx`](@ref). -""" -function openxlsx(source::Union{AbstractString, IO}; - mode::AbstractString="r", - enable_cache::Bool=true) :: XLSXFile - - _read, _write = parse_file_mode(mode) - - if _read - @assert source isa IO || isfile(source) "File $source not found." - return open_or_read_xlsx(source, _read, enable_cache, _write) - else - return open_empty_template() - end -end - -function parse_file_mode(mode::AbstractString) :: Tuple{Bool, Bool} - if mode == "r" - return (true, false) - elseif mode == "w" - return (false, true) - elseif mode == "rw" || mode == "wr" - return (true, true) - else - error("Couldn't parse file mode $mode.") - end -end - -function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, enable_cache::Bool, read_as_template::Bool) :: XLSXFile - # sanity check - if read_as_template - @assert read_files && enable_cache - end - - xf = XLSXFile(source, enable_cache, read_as_template) - - try - for (i, f) in enumerate(ZipArchives.zip_names(xf.io)) - - # ignore xl/calcChain.xml in any case (#31) - if f == "xl/calcChain.xml" - continue - end - - # let customXML files get passed through to write like binaries are (below). - if !startswith(f, "customXml") && (endswith(f, ".xml") || endswith(f, ".rels")) - - # XML file - internal_xml_file_add!(xf, f) - if read_files - - # ignore worksheet files because they'll be read thru streaming - # If reading as template, it will be loaded in two places: here and WorksheetCache. - if !read_as_template && startswith(f, "xl/worksheets") && endswith(f, ".xml") - continue - end - - internal_xml_file_read(xf, f) - end - - elseif read_as_template - # Binary and customXML files - # we only read these files to save the Excel file later - bytes = read(IOBuffer(ZipArchives.zip_readentry(xf.io, f, String))) - @assert sizeof(bytes) == ZipArchives.zip_uncompressed_size(xf.io, i) - xf.binary_data[f] = bytes - end - end - - check_minimum_requirements(xf) - parse_relationships!(xf) - parse_workbook!(xf) - - # read data from Worksheet streams - if read_files - for sheet_name in sheetnames(xf) - sheet = getsheet(xf, sheet_name) - - # to read sheet content, we just need to iterate a SheetRowIterator and the data will be stored in cache - for r in eachrow(sheet) - nothing - end - end - end - - if read_as_template - wb = get_workbook(xf) - if has_sst(wb) - sst_load!(wb) - end - end - - finally - if read_files # Always - close(xf) - else # Never - println("read_files isn't true!") - end - end - - return xf -end - -function get_default_namespace(r::EzXML.Node) :: String - nss = EzXML.namespaces(r) - # in case that only one namespace is defined, assume that it is the default one - # even if it has a prefix - length(nss) == 1 && return nss[1][2] - # otherwise, look for the default namespace (without prefix) - for (prefix, ns) in nss - if prefix == "" - return ns - end - end - - error("No default namespace found.") -end - -# See section 12.2 - Package Structure -function check_minimum_requirements(xf::XLSXFile) - mandatory_files = ["_rels/.rels", - "xl/workbook.xml", - "[Content_Types].xml", - "xl/_rels/workbook.xml.rels" - ] - - for f in mandatory_files - @assert in(f, filenames(xf)) "Malformed XLSX File. Couldn't find file $f in the package." - end - - nothing -end - -# Parses package level relationships defined in `_rels/.rels`. -# Parses workbook level relationships defined in `xl/_rels/workbook.xml.rels`. -function parse_relationships!(xf::XLSXFile) - - # package level relationships - xroot = get_package_relationship_root(xf) - for el in EzXML.eachelement(xroot) - push!(xf.relationships, Relationship(el)) - end - @assert !isempty(xf.relationships) "Relationships not found in _rels/.rels!" - - # workbook level relationships - wb = get_workbook(xf) - xroot = get_workbook_relationship_root(xf) - for el in EzXML.eachelement(xroot) - push!(wb.relationships, Relationship(el)) - end - @assert !isempty(wb.relationships) "Relationships not found in xl/_rels/workbook.xml.rels" - - nothing -end - -# Updates xf.workbook from xf.data[\"xl/workbook.xml\"] -function parse_workbook!(xf::XLSXFile) - xroot = xmlroot(xf, "xl/workbook.xml") - @assert EzXML.nodename(xroot) == "workbook" "Malformed xl/workbook.xml. Root node name should be 'workbook'. Got '$(EzXML.nodename(xroot))'." - - # workbook to be parsed - workbook = get_workbook(xf) - - # workbookPr -> date1904 - # does not have attribute => is not date1904 - workbook.date1904 = false - - # changes workbook.date1904 if there is a setting in the workbookPr node - for node in EzXML.eachelement(xroot) - if EzXML.nodename(node) == "workbookPr" - - # read date1904 attribute - if haskey(node, "date1904") - attribute_value_date1904 = node["date1904"] - - if attribute_value_date1904 == "1" || attribute_value_date1904 == "true" - workbook.date1904 = true - elseif attribute_value_date1904 == "0" || attribute_value_date1904 == "false" - workbook.date1904 = false - else - error("Could not parse xl/workbook -> workbookPr -> date1904 = $(attribute_value_date1904).") - end - end - - break - end - end - - # sheets - sheets = Vector{Worksheet}() - for node in EzXML.eachelement(xroot) - if EzXML.nodename(node) == "sheets" - - for sheet_node in EzXML.eachelement(node) - @assert EzXML.nodename(sheet_node) == "sheet" "Unsupported node $(EzXML.nodename(sheet_node)) in 'xl/workbook.xml'." - worksheet = Worksheet(xf, sheet_node) - push!(sheets, worksheet) - end - - break - end - end - workbook.sheets = sheets - - # named ranges - for node in EzXML.eachelement(xroot) - if EzXML.nodename(node) == "definedNames" - for defined_name_node in EzXML.eachelement(node) - @assert EzXML.nodename(defined_name_node) == "definedName" - defined_value_string = EzXML.nodecontent(defined_name_node) - name = defined_name_node["name"] - - local defined_value::DefinedNameValueTypes - - if is_valid_fixed_sheet_cellname(defined_value_string) || is_valid_sheet_cellname(defined_value_string) - defined_value = SheetCellRef(defined_value_string) - elseif is_valid_fixed_sheet_cellrange(defined_value_string) || is_valid_sheet_cellrange(defined_value_string) - defined_value = SheetCellRange(defined_value_string) - elseif occursin(r"^\".*\"$", defined_value_string) # is enclosed by quotes - defined_value = defined_value_string[2:end-1] # remove enclosing quotes - if isempty(defined_value) - defined_value = missing - end - elseif tryparse(Int, defined_value_string) != nothing - defined_value = parse(Int, defined_value_string) - elseif tryparse(Float64, defined_value_string) != nothing - defined_value = parse(Float64, defined_value_string) - elseif isempty(defined_value_string) - defined_value = missing - else - - # Couldn't parse definedName. Will silently ignore it, since this is not a critical feature. - continue - - # debug - #error("Could not parse value $(defined_value_string) for definedName $name.") - end - - if haskey(defined_name_node, "localSheetId") - # is a Worksheet level name - - # localSheetId is the 0-based index of the Worksheet in the order - # that it is displayed on screen. - # Which is the order of the elements under element in workbook.xml . - localSheetId = parse(Int, defined_name_node["localSheetId"]) + 1 - sheetId = workbook.sheets[localSheetId].sheetId - workbook.worksheet_names[(sheetId, name)] = defined_value - else - # is a Workbook level name - workbook.workbook_names[name] = defined_value - end - end - - break - end - end - - nothing -end - -# Lazy loading of XML files - -# Lists internal files from the XLSX package. -@inline filenames(xl::XLSXFile) = keys(xl.files) - -# Returns true if the file data was read into xl.data. -@inline internal_xml_file_isread(xl::XLSXFile, filename::String) :: Bool = xl.files[filename] -@inline internal_xml_file_exists(xl::XLSXFile, filename::String) :: Bool = haskey(xl.files, filename) - -function internal_xml_file_add!(xl::XLSXFile, filename::String) - @assert endswith(filename, ".xml") || endswith(filename, ".rels") - xl.files[filename] = false - nothing -end - -function internal_xml_file_read(xf::XLSXFile, filename::String) :: EzXML.Document - @assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)." - - if !internal_xml_file_isread(xf, filename) - @assert isopen(xf) "Can't read from a closed XLSXFile." - file_not_found = true - for f in ZipArchives.zip_names(xf.io) - if f == filename - xf.files[filename] = true # set file as read - - try - xf.data[filename] = EzXML.parsexml(ZipArchives.zip_readentry(xf.io, f, String)) - catch err - @error("Failed to parse internal XML file `$filename`") - rethrow() - end - - file_not_found = false - break - end - end - - if file_not_found - # shouldn't happen - error("$filename not found in XLSX package.") - end - end - - return xf.data[filename] -end - -function Base.close(xl::XLSXFile) - xl.io_is_open = false -# close(xl.io) - # close all internal file streams from worksheet caches - for sheet in xl.workbook.sheets - if sheet.cache != nothing && sheet.cache.stream_state != nothing - close(sheet.cache.stream_state) - end - end -end - -Base.isopen(xl::XLSXFile) = xl.io_is_open - -# Utility method to find the XMLDocument associated with a given package filename. -# Returns xl.data[filename] if it exists. Throws an error if it doesn't. -@inline xmldocument(xl::XLSXFile, filename::String) :: EzXML.Document = internal_xml_file_read(xl, filename) - -# Utility method to return the root element of a given XMLDocument from the package. -# Returns EzXML.root(xl.data[filename]) if it exists. -@inline xmlroot(xl::XLSXFile, filename::String) :: EzXML.Node = EzXML.root(xmldocument(xl, filename)) - -# -# Helper Functions -# - -""" - readdata(source, sheet, ref) - readdata(source, sheetref) - -Returns a scalar or matrix with values from a spreadsheet. - -See also [`XLSX.getdata`](@ref). - -# Examples - -These function calls are equivalent. - -```julia -julia> XLSX.readdata("myfile.xlsx", "mysheet", "A2:B4") -3×2 Array{Any,2}: - 1 "first" - 2 "second" - 3 "third" - -julia> XLSX.readdata("myfile.xlsx", 1, "A2:B4") -3×2 Array{Any,2}: - 1 "first" - 2 "second" - 3 "third" - -julia> XLSX.readdata("myfile.xlsx", "mysheet!A2:B4") -3×2 Array{Any,2}: - 1 "first" - 2 "second" - 3 "third" -``` -""" -function readdata(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, ref) - c = openxlsx(source, enable_cache=true) do xf - getdata(getsheet(xf, sheet), ref) - end - return c -end -function readdata(source::Union{AbstractString, IO}, sheetref::AbstractString) - c = openxlsx(source, enable_cache=true) do xf - getdata(xf, sheetref) - end - return c -end - -""" - readtable( - source, - sheet, - [columns]; - [first_row], - [column_labels], - [header], - [infer_eltypes], - [stop_in_empty_row], - [stop_in_row_function], - [keep_empty_rows] - ) -> DataTable - -Returns tabular data from a spreadsheet as a struct `XLSX.DataTable`. -Use this function to create a `DataFrame` from package `DataFrames.jl`. - -Use `columns` argument to specify which columns to get. -For example, `"B:D"` will select columns `B`, `C` and `D`. -If `columns` is not given, the algorithm will find the first sequence -of consecutive non-empty cells. - -Use `first_row` to indicate the first row from the table. -`first_row=5` will look for a table starting at sheet row `5`. -If `first_row` is not given, the algorithm will look for the first -non-empty row in the spreadsheet. - -`header` is a `Bool` indicating if the first row is a header. -If `header=true` and `column_labels` is not specified, the column labels -for the table will be read from the first row of the table. -If `header=false` and `column_labels` is not specified, the algorithm -will generate column labels. The default value is `header=true`. - -Use `column_labels` to specify names for the header of the table. - -Use `infer_eltypes=true` to get `data` as a `Vector{Any}` of typed vectors. -The default value is `infer_eltypes=false`. - -`stop_in_empty_row` is a boolean indicating whether an empty row marks the end of the table. -If `stop_in_empty_row=false`, the `TableRowIterator` will continue to fetch rows until there's no more rows in the Worksheet. -The default behavior is `stop_in_empty_row=true`. - -`stop_in_row_function` is a Function that receives a `TableRow` and returns a `Bool` indicating if the end of the table was reached. - -Example for `stop_in_row_function`: - -``` -function stop_function(r) - v = r[:col_label] - return !ismissing(v) && v == "unwanted value" -end -``` - -`keep_empty_rows` determines whether rows where all column values are equal to `missing` are kept (`true`) or dropped (`false`) from the resulting table. -`keep_empty_rows` never affects the *bounds* of the table; the number of rows read from a sheet is only affected by, `first_row`, `stop_in_empty_row` and `stop_in_row_function` (if specified). -`keep_empty_rows` is only checked once the first and last row of the table have been determined, to see whether to keep or drop empty rows between the first and the last row. - -# Example - -```julia -julia> using DataFrames, XLSX - -julia> df = DataFrame(XLSX.readtable("myfile.xlsx", "mysheet")) -``` - -See also: [`XLSX.gettable`](@ref). -""" -function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=true, keep_empty_rows::Bool=false) - c = openxlsx(source, enable_cache=enable_cache) do xf - gettable(getsheet(xf, sheet); first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows) - end - return c -end - -function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, columns::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=true, keep_empty_rows::Bool=false) - c = openxlsx(source, enable_cache=enable_cache) do xf - gettable(getsheet(xf, sheet), columns; first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows) - end - return c -end From acc6efb220dcf69a67ba356c95fa4c8f673160e8 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:46:18 +0100 Subject: [PATCH 13/18] Remove ZipFile --- src/worksheet.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/worksheet.jl b/src/worksheet.jl index c25b635..04c8bda 100644 --- a/src/worksheet.jl +++ b/src/worksheet.jl @@ -6,7 +6,6 @@ function Worksheet(xf::XLSXFile, sheet_element::EzXML.Node) name = sheet_element["name"] is_hidden = haskey(sheet_element, "state") && sheet_element["state"] in ["hidden", "veryHidden"] dim = read_worksheet_dimension(xf, relationship_id, name) - return Worksheet(xf, sheetId, relationship_id, name, dim, is_hidden) end @@ -26,7 +25,6 @@ end # 18.3.1.35 - dimension (Worksheet Dimensions). This is optional, and not required. function read_worksheet_dimension(xf::XLSXFile, relationship_id, name) :: Union{Nothing, CellRange} local result::Union{Nothing, CellRange} = nothing - wb = get_workbook(xf) target_file = get_relationship_target_by_id("xl", wb, relationship_id) zip_io, reader = open_internal_file_stream(xf, target_file) @@ -48,7 +46,7 @@ function read_worksheet_dimension(xf::XLSXFile, relationship_id, name) :: Union{ end finally close(reader) - close(zip_io) +# close(zip_io) end return result From d81e672179eea808e7d405be3336efc886bb14c2 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:50:13 +0100 Subject: [PATCH 14/18] Remove ZipFile From b28cc988a0a0025f68fa162ecc6b7b6d735be0f4 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:53:46 +0100 Subject: [PATCH 15/18] Simplify per suggestion from @nhz2 This should be faster because it avoids creating a String and there is a check of the uncompressed_size in /~https://github.com/JuliaIO/ZipArchives.jl/blob/f955785e237a0a8b3607cf651eaebc1eb1037b8c/src/reader.jl#L344 Co-authored-by: Nathan Zimmerberg <39104088+nhz2@users.noreply.github.com> --- src/read.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/read.jl b/src/read.jl index e66fc25..015bbbd 100644 --- a/src/read.jl +++ b/src/read.jl @@ -222,9 +222,7 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, elseif read_as_template # Binary and customXML files # we only read these files to save the Excel file later - bytes = read(IOBuffer(ZipArchives.zip_readentry(xf.io, f, String))) - @assert sizeof(bytes) == ZipArchives.zip_uncompressed_size(xf.io, i) - xf.binary_data[f] = bytes + xf.binary_data[f] = ZipArchives.zip_readentry(xf.io, f) end end From ffdc969980b9871f3f49e0c670871338a50ce9e0 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:54:56 +0100 Subject: [PATCH 16/18] Following suggestion from @nhz2 zip_openentry can be used here to avoid decompressing the entire entry into memory. Also, the error on the line after this can be removed with this change. Co-authored-by: Nathan Zimmerberg <39104088+nhz2@users.noreply.github.com> --- src/stream.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/stream.jl b/src/stream.jl index b603fee..7b740ea 100644 --- a/src/stream.jl +++ b/src/stream.jl @@ -51,11 +51,8 @@ Base.show(io::IO, state::SheetRowStreamIteratorState) = print(io, "SheetRowStrea @assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)." @assert xf.source isa IO || isfile(xf.source) "Can't open internal file $filename for streaming because the XLSX file $(xf.filepath) was not found." - for f in ZipArchives.zip_names(xf.io) - if f == filename - return xf.io, EzXML.StreamReader(IOBuffer(ZipArchives.zip_readentry(xf.io, f, String))) - end - end + f = ZipArchives.zip_openentry(xf.io, filename) + return xf.io, EzXML.StreamReader(f) error("Couldn't find $filename in $(xf.source).") end From 37eb14f9aade4338fd38960f1bef7aba0e15ba46 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:57:09 +0100 Subject: [PATCH 17/18] Following suggestion from @nhz2 This should be faster because it avoids allocating all of the entry names at once. Co-authored-by: Nathan Zimmerberg <39104088+nhz2@users.noreply.github.com> --- src/read.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/read.jl b/src/read.jl index 015bbbd..06d42e4 100644 --- a/src/read.jl +++ b/src/read.jl @@ -196,8 +196,8 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool, xf = XLSXFile(source, enable_cache, read_as_template) try - for (i, f) in enumerate(ZipArchives.zip_names(xf.io)) - + for i in 1:ZipArchives.zip_nentries(xf.io) + f = ZipArchives.zip_name(xf.io, i) # ignore xl/calcChain.xml in any case (#31) if f == "xl/calcChain.xml" continue From 7600fd5561156b362e8384801128456421ae13d7 Mon Sep 17 00:00:00 2001 From: TimG1964 <157401228+TimG1964@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:14:56 +0100 Subject: [PATCH 18/18] Following suggestion by @nhz2 --- src/read.jl | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/read.jl b/src/read.jl index 06d42e4..7c31229 100644 --- a/src/read.jl +++ b/src/read.jl @@ -436,28 +436,17 @@ function internal_xml_file_read(xf::XLSXFile, filename::String) :: EzXML.Documen @assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)." if !internal_xml_file_isread(xf, filename) + @assert isopen(xf) "Can't read from a closed XLSXFile." - file_not_found = true - for f in ZipArchives.zip_names(xf.io) - if f == filename - xf.files[filename] = true # set file as read - - try - xf.data[filename] = EzXML.parsexml(ZipArchives.zip_readentry(xf.io, f, String)) - catch err - @error("Failed to parse internal XML file `$filename`") - rethrow() - end - file_not_found = false - break - end + try + xf.data[filename] = EzXML.parsexml(ZipArchives.zip_readentry(xf.io, filename)) + xf.files[filename] = true # set file as read + catch err + @error("Failed to parse internal XML file `$filename`") + rethrow() end - if file_not_found - # shouldn't happen - error("$filename not found in XLSX package.") - end end return xf.data[filename]