Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove dependency on ZipFile and remove GC call from read #273

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,11 @@ EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"

[compat]
EzXML = "1"
Tables = "1"
ZipArchives = "2"
ZipFile = "0.8, 0.9, 0.10"
julia = "1.6"

[extras]
Expand Down
6 changes: 0 additions & 6 deletions src/XLSX.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,11 @@ module XLSX
import Artifacts
import Dates
import Printf.@printf
import ZipFile
import ZipArchives
import EzXML
import Tables
import Base.convert

# /~https://github.com/fhs/ZipFile.jl/issues/39
if !hasmethod(Base.bytesavailable, Tuple{ZipFile.ReadableFile})
Base.bytesavailable(f::ZipFile.ReadableFile) = f.uncompressedsize - f._pos
end

const SPREADSHEET_NAMESPACE_XPATH_ARG = [ "xpath" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main" ]
const EXCEL_MAX_COLS = 16_384 # total columns supported by Excel per sheet
const EXCEL_MAX_ROWS = 1_048_576 # total rows supported by Excel per sheet (including headers)
Expand Down
89 changes: 34 additions & 55 deletions src/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,13 @@ function openxlsx(f::F, source::Union{AbstractString, IO};

if _read
@assert source isa IO || isfile(source) "File $source not found."
xf = open_or_read_xlsx(source, _write, enable_cache, _write)

xf = open_or_read_xlsx(source, _read, enable_cache, _write)
else
xf = open_empty_template()
end


try
f(xf)
finally
Expand All @@ -147,8 +149,6 @@ function openxlsx(f::F, source::Union{AbstractString, IO};
close(xf)
end

# fix libuv issue on windows (#42) and other systems (#173)
GC.gc()
end
end

Expand All @@ -169,7 +169,7 @@ function openxlsx(source::Union{AbstractString, IO};

if _read
@assert source isa IO || isfile(source) "File $source not found."
return open_or_read_xlsx(source, _write, enable_cache, _write)
return open_or_read_xlsx(source, _read, enable_cache, _write)
else
return open_empty_template()
end
Expand All @@ -196,43 +196,33 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool,
xf = XLSXFile(source, enable_cache, read_as_template)

try
for f in xf.io.files

for i in 1:ZipArchives.zip_nentries(xf.io)
f = ZipArchives.zip_name(xf.io, i)
# ignore xl/calcChain.xml in any case (#31)
if f.name == "xl/calcChain.xml"
if f == "xl/calcChain.xml"
continue
end

# Rather than ignore custom XML internal files here, let them get passed through to write like binaries are.
if !startswith(f.name, "customXml") && (endswith(f.name, ".xml") || endswith(f.name, ".rels"))
#if endswith(f.name, ".xml") || endswith(f.name, ".rels")
# let customXML files get passed through to write like binaries are (below).
if !startswith(f, "customXml") && (endswith(f, ".xml") || endswith(f, ".rels"))

# XML file
internal_xml_file_add!(xf, f.name)
internal_xml_file_add!(xf, f)
if read_files

# ignore worksheet files because they'll be read thru streaming
# If reading as template, it will be loaded in two places: here and WorksheetCache.
if !read_as_template && startswith(f.name, "xl/worksheets") && endswith(f.name, ".xml")
if !read_as_template && startswith(f, "xl/worksheets") && endswith(f, ".xml")
continue
end

# ignore custom XML internal files
# no longer needed if these files are passed through like binary files
#if startswith(f.name, "customXml")
# continue
#end

internal_xml_file_read(xf, f.name)
internal_xml_file_read(xf, f)
end
elseif read_as_template

# Binary file
# we only read binary files to save the Excel file later
# Custom XML files also now get passed through this way, too
bytes = ZipFile.read(f)
@assert sizeof(bytes) == f.uncompressedsize
xf.binary_data[f.name] = bytes
elseif read_as_template
# Binary and customXML files
# we only read these files to save the Excel file later
xf.binary_data[f] = ZipArchives.zip_readentry(xf.io, f)
end
end

Expand Down Expand Up @@ -260,8 +250,10 @@ function open_or_read_xlsx(source::Union{IO, AbstractString}, read_files::Bool,
end

finally
if read_files
if read_files # Always
close(xf)
else # Never
println("read_files isn't true!")
end
end

Expand Down Expand Up @@ -444,43 +436,31 @@ function internal_xml_file_read(xf::XLSXFile, filename::String) :: EzXML.Documen
@assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)."

if !internal_xml_file_isread(xf, filename)

@assert isopen(xf) "Can't read from a closed XLSXFile."
file_not_found = true
for f in xf.io.files
if f.name == filename
xf.files[filename] = true # set file as read

try
xf.data[filename] = EzXML.readxml(f)
catch err
@error("Failed to parse internal XML file `$filename`")
rethrow()
end

file_not_found = false
break
end
try
xf.data[filename] = EzXML.parsexml(ZipArchives.zip_readentry(xf.io, filename))
xf.files[filename] = true # set file as read
catch err
@error("Failed to parse internal XML file `$filename`")
rethrow()
end

if file_not_found
# shouldn't happen
error("$filename not found in XLSX package.")
end
end

return xf.data[filename]
end

function Base.close(xl::XLSXFile)
xl.io_is_open = false
close(xl.io)

# close(xl.io)
# close all internal file streams from worksheet caches
for sheet in xl.workbook.sheets
if sheet.cache != nothing && sheet.cache.stream_state != nothing
close(sheet.cache.stream_state)
end
end
# for sheet in xl.workbook.sheets
# if sheet.cache != nothing && sheet.cache.stream_state != nothing
# close(sheet.cache.stream_state)
# end
# end
end

Base.isopen(xl::XLSXFile) = xl.io_is_open
Expand Down Expand Up @@ -530,14 +510,13 @@ julia> XLSX.readdata("myfile.xlsx", "mysheet!A2:B4")
```
"""
function readdata(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, ref)
c = openxlsx(source, enable_cache=false) do xf
c = openxlsx(source, enable_cache=true) do xf
getdata(getsheet(xf, sheet), ref)
end
return c
end

function readdata(source::Union{AbstractString, IO}, sheetref::AbstractString)
c = openxlsx(source, enable_cache=false) do xf
c = openxlsx(source, enable_cache=true) do xf
getdata(xf, sheetref)
end
return c
Expand Down
20 changes: 8 additions & 12 deletions src/stream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,12 @@ end
Base.show(io::IO, state::SheetRowStreamIteratorState) = print(io, "SheetRowStreamIteratorState( is open = $(state.is_open) , row = $(state.row) )")

# Opens a file for streaming.
@inline function open_internal_file_stream(xf::XLSXFile, filename::String) :: Tuple{ZipFile.Reader, EzXML.StreamReader}
@inline function open_internal_file_stream(xf::XLSXFile, filename::String) :: Tuple{ZipArchives.ZipReader, EzXML.StreamReader}
@assert internal_xml_file_exists(xf, filename) "Couldn't find $filename in $(xf.source)."
@assert xf.source isa IO || isfile(xf.source) "Can't open internal file $filename for streaming because the XLSX file $(xf.filepath) was not found."
io = ZipFile.Reader(xf.source)

for f in io.files
if f.name == filename
return io, EzXML.StreamReader(f)
end
end
f = ZipArchives.zip_openentry(xf.io, filename)
return xf.io, EzXML.StreamReader(f)

error("Couldn't find $filename in $(xf.source).")
end
Expand All @@ -67,7 +63,7 @@ end
if isopen(s)
s.is_open = false
close(s.xml_stream_reader)
close(s.zip_io)
# close(s.zip_io)
end
nothing
end
Expand Down Expand Up @@ -102,7 +98,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo
elseif is_end_of_sheet_data(reader)
# this Worksheet has no rows
close(reader)
close(zip_io)
# close(zip_io)
return nothing
end
end
Expand All @@ -117,7 +113,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo

reader = state.xml_stream_reader
if is_end_of_sheet_data(reader)
@assert !isopen(state)
# @assert !isopen(state)
return nothing
else
@assert isopen(state) "Error processing Worksheet $(ws.name): Can't fetch rows from a closed workbook."
Expand All @@ -133,7 +129,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo
while EzXML.iterate(reader) != nothing

if is_end_of_sheet_data(reader)
close(state)
# close(state)
break
end

Expand All @@ -142,7 +138,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::Union{Nothing, SheetRo

while true
if is_end_of_sheet_data(reader)
close(state)
# close(state)
break
elseif EzXML.nodetype(reader) == EzXML.READER_ELEMENT && nodename(reader) == "row"
break
Expand Down
6 changes: 3 additions & 3 deletions src/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ Implementations: SheetRowStreamIterator, WorksheetCache.
abstract type SheetRowIterator end

mutable struct SheetRowStreamIteratorState
zip_io::ZipFile.Reader
zip_io::ZipArchives.ZipReader
xml_stream_reader::EzXML.StreamReader
is_open::Bool # indicated if zip_io and xml_stream_reader are opened
row::Int # number of current row. It´s set to 0 in the start state.
Expand Down Expand Up @@ -284,7 +284,7 @@ sh = xf["mysheet"] # get a reference to a Worksheet
mutable struct XLSXFile <: MSOfficePackage
source::Union{AbstractString, IO}
use_cache_for_sheet_data::Bool # indicates whether Worksheet.cache will be fed while reading worksheet cells.
io::ZipFile.Reader
io::ZipArchives.ZipReader
io_is_open::Bool
files::Dict{String, Bool} # maps filename => isread bool
data::Dict{String, EzXML.Document} # maps filename => XMLDocument
Expand All @@ -295,7 +295,7 @@ mutable struct XLSXFile <: MSOfficePackage

function XLSXFile(source::Union{AbstractString, IO}, use_cache::Bool, is_writable::Bool)
check_for_xlsx_file_format(source)
io = ZipFile.Reader(source)
io = ZipArchives.ZipReader(read(source))
xl = new(source, use_cache, io, true, Dict{String, Bool}(), Dict{String, EzXML.Document}(), Dict{String, Vector{UInt8}}(), EmptyWorkbook(), Vector{Relationship}(), is_writable)
xl.workbook.package = xl
finalizer(close, xl)
Expand Down
4 changes: 1 addition & 3 deletions src/worksheet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ function Worksheet(xf::XLSXFile, sheet_element::EzXML.Node)
name = sheet_element["name"]
is_hidden = haskey(sheet_element, "state") && sheet_element["state"] in ["hidden", "veryHidden"]
dim = read_worksheet_dimension(xf, relationship_id, name)

return Worksheet(xf, sheetId, relationship_id, name, dim, is_hidden)
end

Expand All @@ -26,7 +25,6 @@ end
# 18.3.1.35 - dimension (Worksheet Dimensions). This is optional, and not required.
function read_worksheet_dimension(xf::XLSXFile, relationship_id, name) :: Union{Nothing, CellRange}
local result::Union{Nothing, CellRange} = nothing

wb = get_workbook(xf)
target_file = get_relationship_target_by_id("xl", wb, relationship_id)
zip_io, reader = open_internal_file_stream(xf, target_file)
Expand All @@ -48,7 +46,7 @@ function read_worksheet_dimension(xf::XLSXFile, relationship_id, name) :: Union{
end
finally
close(reader)
close(zip_io)
# close(zip_io)
end

return result
Expand Down
17 changes: 12 additions & 5 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1354,14 +1354,21 @@ end
@test dt_read.column_labels == dt.column_labels
@test dt_read.column_label_index == dt.column_label_index
end
end

# delete files created by this testset
delete_files = ["output_table.xlsx", "output_tables.xlsx"]
for f in delete_files
isfile(f) && rm(f)
@testset "rm after read" begin
@testset "with readtable" begin
f = "output_table.xlsx"
dtable = XLSX.readtable(f, "report", "F")
@test isfile(f) && rm(f)==nothing
end
@testset "with readdata" begin
f = "output_tables.xlsx"
dtable = XLSX.readdata(f, "REPORT_A", "A1:B3")
@test isfile(f) && rm(f)==nothing
end
end

@testset "Styles" begin

using XLSX: CellValue, id, getcell, setdata!, CellRef
Expand Down
Loading