-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: APIs to support tar archives #65951
Comments
Tagging subscribers to this area: @dotnet/area-system-io Issue DetailsBackground and motivationCreating a new issue to get fresh feedback. Original tar proposal Tar is an old, stable and robust archiving format that is heavily used, particularly in the Unix world. The community has expressed interest in having .NET offer APIs that would allow creation, manipulation and extraction of tar files. The following proposal aims to satisfy the request. API Proposalnamespace System.Formats.Tar
{
// Easy to use, straightforward archiving and extraction APIs that do not require manipulating streams.
// Note: a class cannot be both 'sealed' and 'static'; 'static' alone already prevents instantiation and inheritance.
public static class TarFile
{
    // Archives the contents of sourceDirectoryName into a new tar file at destinationArchiveFileName.
    // includeBaseDirectory: when true, all entry paths are prefixed by the name of the source directory.
    public static void CreateFromDirectory(string sourceDirectoryName, string destinationArchiveFileName, bool includeBaseDirectory);

    // Extracts the contents of the tar file at sourceArchiveFileName into destinationDirectoryName.
    // overwriteFiles: when false, nothing already present in the destination directory is overwritten.
    public static void ExtractToDirectory(string sourceArchiveFileName, string destinationDirectoryName, bool overwriteFiles);
}
// Enum representing the entry types that can be detected from V7, Ustar, PAX and GNU.
// Each value is the single character that identifies the entry type. The explicit (byte) casts are
// required because a char constant is not implicitly convertible to the enum's byte underlying type.
public enum TarEntryType : byte
{
    OldRegularFile = (byte)'\0', // Used exclusively by V7

    // Used by all formats
    RegularFile = (byte)'0',
    HardLink = (byte)'1',
    SymbolicLink = (byte)'2',
    CharacterDevice = (byte)'3',
    BlockDevice = (byte)'4',
    Directory = (byte)'5',
    Fifo = (byte)'6',

    // Exclusively used by PAX
    GlobalExtendedAttributes = (byte)'g',
    ExtendedAttributes = (byte)'x',

    // Exclusively used by GNU
    ContiguousFile = (byte)'7',
    DirectoryEntry = (byte)'D',
    LongLink = (byte)'K',
    LongPath = (byte)'L',
    MultiVolume = (byte)'M',
    RenamedOrSymlinked = (byte)'N',
    SparseFile = (byte)'S',
    TapeVolume = (byte)'V', // Tape/volume header name; the GNU type flag is 'V', not 'T'
}
// The formats these APIs will be able to read
// (the TarWriter constructor also accepts one of these values to choose the format of a new archive)
public enum TarFormat
{
Unknown = 0, // For when an archive that is being read is not recognized
// 1979 Version 7 AT&T Unix tar format
V7 = 1,
// POSIX IEEE 1003.1-1988 Unix standard tar format
Ustar = 2,
// POSIX IEEE 1003.1-2001 pax interchange format; proposed as the default for the writing APIs
Pax = 3,
// GNU tar format
Gnu = 4,
}
// For traversing entries in an existing tar archive
// The only requirement on the archive stream is that it is readable; it does not need to be seekable.
public sealed class TarReader : System.IDisposable
{
// leaveOpen: when true, archiveStream is not disposed when the reader is disposed.
public TarReader(System.IO.Stream archiveStream, bool leaveOpen = false);
// Format of the archive; intended to be detected when the reader is created, before any entry is read.
public System.Formats.Tar.TarFormat Format { get; }
// Global extended attributes of the archive. A non-null value implies the archive is PAX,
// since PAX is the only format that supports them.
public System.Collections.Generic.IReadOnlyDictionary<string, string>? GlobalExtendedAttributes { get; }
// Also disposes the streams that were created to wrap the data section of the entries.
public void Dispose();
// Returns the next entry, or null when there are no more entries to read.
// copyData: when true, the entry's data is preserved internally so it can still be read after the reader
// advances (useful for unseekable streams); when false, the data must be consumed before the next call.
public System.Formats.Tar.TarEntry? GetNextEntry(bool copyData = false);
}
// For creating a tar archive
public sealed class TarWriter : System.IDisposable
{
// Creates a writer without specifying a format: PAX is assumed. When globalExtendedAttributes is provided,
// a Global Extended Attributes entry is written at the beginning of the archive.
// leaveOpen: when true, archiveStream is not closed when the writer is disposed.
public TarWriter(System.IO.Stream archiveStream, bool leaveOpen = false, System.Collections.Generic.ICollection<System.Collections.Generic.KeyValuePair<string, string>>? globalExtendedAttributes = null);
// Creates a writer that produces an archive in the specified format (no Global Extended Attributes entry).
public TarWriter(System.IO.Stream archiveStream, System.Formats.Tar.TarFormat archiveFormat, bool leaveOpen = false);
// The format this writer produces.
public System.Formats.Tar.TarFormat Format { get; }
// Adds the filesystem object found at fileName under the name entryName; the entry type is detected
// automatically from the filesystem object.
public void AddFile(string fileName, string? entryName);
// Disposing triggers writing the empty records that mark the end of the archive.
public void Dispose();
// Writes a manually constructed entry, or an entry obtained from a TarReader.
public void WriteEntry(System.Formats.Tar.TarEntry entry);
}
// Abstract type to represent the header record's metadata fields
// These fields are found in all the tar formats
public abstract class TarEntry
{
// Not publicly constructible: instances come from a TarReader or from the format-specific subclasses.
internal TarEntry();
// Header checksum field (read-only).
public int Checksum { get; }
// Stream over the entry's data section, if any. When writing, only regular file entries
// (OldRegularFile in V7, RegularFile in the other formats) can carry data; a stream assigned by the
// caller has to be disposed by the caller.
public System.IO.Stream? DataStream { get; set; }
// Type of filesystem object this entry represents.
public System.Formats.Tar.TarEntryType EntryType { get; }
// Group ID of the owner.
public int Gid { get; set; }
// Length field of the header (read-only) — presumably the size in bytes of the data section; confirm.
public long Length { get; }
// Target path for link entries.
public string LinkName { get; set; }
// Unix permission bits.
public System.IO.UnixFileMode Mode { get; set; }
// Modification time.
public System.DateTimeOffset MTime { get; set; }
// Path of the entry inside the archive.
public string Name { get; set; }
// User ID of the owner.
public int Uid { get; set; }
// Extracts this entry to destinationFileName. overwrite presumably controls whether an existing
// file at that path may be replaced — confirm against the implementation.
public void ExtractToFile(string destinationFileName, bool overwrite);
public override string ToString();
}
// Allows instancing a V7 tar entry
// V7 adds no metadata fields beyond those of TarEntry; regular files use TarEntryType.OldRegularFile.
public sealed class TarEntryV7 : System.Formats.Tar.TarEntry
{
// entryType: the kind of entry to create; entryName: the path of the entry inside the archive.
public TarEntryV7(System.Formats.Tar.TarEntryType entryType, string entryName);
}
// Allows instancing a Ustar tar entry
// Contains four additional metadata fields found in Ustar
public sealed class TarEntryUstar : System.Formats.Tar.TarEntry
{
// entryType: the kind of entry to create; entryName: the path of the entry inside the archive.
public TarEntryUstar(System.Formats.Tar.TarEntryType entryType, string entryName);
// Major device number (relevant for character/block device entries).
public int DeviceMajor { get; set; }
// Minor device number (relevant for character/block device entries).
public int DeviceMinor { get; set; }
// Owner group name.
public string GName { get; set; }
// Owner user name.
public string UName { get; set; }
}
// Allows instancing a PAX tar entry
// Contains four additional metadata fields found in Pax
// as well as a dictionary that exposes the extended attributes found in the extra metadata entry the Pax format defines
public sealed class TarEntryPax : System.Formats.Tar.TarEntry
{
// extendedAttributes may be null; the extended attributes metadata header is created anyway.
// Note that the parameter has no default value, so it must always be passed explicitly.
public TarEntryPax(System.Formats.Tar.TarEntryType entryType, string entryName, System.Collections.Generic.ICollection<System.Collections.Generic.KeyValuePair<string, string>>? extendedAttributes);
// Major device number (relevant for character/block device entries).
public int DeviceMajor { get; set; }
// Minor device number (relevant for character/block device entries).
public int DeviceMinor { get; set; }
// Read-only view of the entry's extended attributes (for example "atime" or "ctime").
public System.Collections.Generic.IReadOnlyDictionary<string, string> ExtendedAttributes { get; }
// Owner group name.
public string GName { get; set; }
// Owner user name.
public string UName { get; set; }
}
// Allows instancing a GNU tar entry
// Contains six additional metadata fields found in GNU
public sealed class TarEntryGnu : System.Formats.Tar.TarEntry
{
// entryType: the kind of entry to create; entryName: the path of the entry inside the archive.
public TarEntryGnu(System.Formats.Tar.TarEntryType entryType, string entryName);
// Access time (a header field in GNU, unlike PAX where it is an extended attribute).
public System.DateTimeOffset ATime { get; set; }
// Change time (a header field in GNU, unlike PAX where it is an extended attribute).
public System.DateTimeOffset CTime { get; set; }
// Major device number (relevant for character/block device entries).
public int DeviceMajor { get; set; }
// Minor device number (relevant for character/block device entries).
public int DeviceMinor { get; set; }
// Owner group name.
public string GName { get; set; }
// Owner user name.
public string UName { get; set; }
}
}
namespace System.IO
{
/// <summary>
/// Unix permission bits, combinable with bitwise OR. Each member occupies a single bit,
/// laid out from the lowest bit (other/execute) up to the highest (user special).
/// </summary>
[System.Flags]
public enum UnixFileMode
{
    /// <summary>No bits set.</summary>
    None = 0,

    // "Other" permission triplet (bits 0-2)
    OtherExecute = 1 << 0,
    OtherWrite = 1 << 1,
    OtherRead = 1 << 2,

    // "Group" permission triplet (bits 3-5)
    GroupExecute = 1 << 3,
    GroupWrite = 1 << 4,
    GroupRead = 1 << 5,

    // "User" permission triplet (bits 6-8)
    UserExecute = 1 << 6,
    UserWrite = 1 << 7,
    UserRead = 1 << 8,

    // Special bits (bits 9-11)
    StickyBit = 1 << 9,
    GroupSpecial = 1 << 10,
    UserSpecial = 1 << 11,
}
} The tar archiving format's specification is best described in the FreeBSD man 5 page. The tar spec defines a set of rules to collect filesystem objects into a single stream of bytes. A tar archive consists of a series of 512-byte records, where the first record that represents a filesystem object (the "header") contains fixed-size metadata fields describing said object, and the subsequent records have the actual data of the file. When the data size is not a multiple of 512, it is always zero-padded to guarantee the next record (or "header" record) will be found on the next multiple of 512. The end of a tar archive is found if at least two zero-byte 512 records are found. Unlike the zip archiving+compression format, the tar format does not have a central directory. This means there is no way of knowing how many files a tar archive contains unless the whole archive is traversed. The tar format evolved over time, and currently there are four well known formats:
The default format for the writing APIs is proposed to be PAX. API UsageWe can split the APIs into different categories according to the type of usage they are intended for: Stream-less APIsThe // Generates a tar archive where all the entry paths are prefixed by the root directory 'SourceDirectory'
TarFile.CreateFromDirectory(sourceDirectoryName: "D:/SourceDirectory/", destinationArchiveFileName: "D:/destination.tar", includeBaseDirectory: true);
// Extracts the contents of a tar archive into the specified directory, but avoiding overwriting anything found inside
TarFile.ExtractToDirectory(sourceArchiveFileName: "D:/destination.tar", destinationDirectoryName: "D:/DestinationDirectory/", overwriteFiles: false); Reading an archive entry by entryThe FileStream archiveStream = File.Open("D:/archive.tar", FileMode.Open, FileAccess.Read); The only requirement to be able to iterate the entries of a stream representing a tar archive is that the stream is readable. using TarReader reader = new TarReader(archiveStream, leaveOpen: true);
Console.WriteLine($"Format: {reader.Format}");
if (reader.GlobalExtendedAttributes != null)
{
Console.WriteLine("Format is PAX");
}
TarEntry? entry;
while ((entry = reader.GetNextEntry()) != null)
{
Console.WriteLine($"Entry name: {entry.Name}, entry type: {entry.EntryType}");
entry.ExtractToFile(destinationFileName: Path.Join("D:/MyExtractionFolder/", entry.Name), overwrite: false);
} What if the passed stream is unseekable, like when it comes from the network? Then the user will have two options: They can read it as it arrives, but knowing that it will be lost when the next entry is read: public void ReadTarFromNetwork(NetworkStream archiveStream) // This stream is not seekable
{
    using TarReader reader = new TarReader(archiveStream);
    // The loop variable has to be declared before it can be assigned inside the loop condition
    TarEntry? entry;
    while ((entry = reader.GetNextEntry(copyData: false)) != null) // Not copying the data means it needs to be read now, before advancing the stream position
    {
        if (entry.EntryType is TarEntryType.RegularFile)
        {
            // This needs to be done now because the position pointer will not be able to seek back later
            entry.ExtractToFile(destinationFileName: Path.Join("D:/MyExtractionFolder/", entry.Name), overwrite: false);
            DoSomethingWithTheData(entry.DataStream); // This won't be possible since the data stream position pointer is at the end of the stream
        }
    }
}
} Or they can request to get the data preserved internally for reading later: public void ReadTarFromNetwork(NetworkStream archiveStream) // This stream is not seekable
{
    List<TarEntry> entries = new List<TarEntry>();
    using TarReader reader = new TarReader(archiveStream);
    // Declared under a different name than the foreach variable below: C# does not allow two
    // locals called 'entry' in overlapping scopes of the same method.
    TarEntry? current;
    while ((current = reader.GetNextEntry(copyData: true)) != null) // Copy the data internally for later usage
    {
        entries.Add(current);
    } // Stream position is now located at the end of the stream
    foreach (TarEntry entry in entries)
    {
        if (entry.EntryType is TarEntryType.RegularFile)
        {
            // This is possible because the data was saved internally
            entry.ExtractToFile(destinationFileName: Path.Join("D:/MyExtractionFolder/", entry.Name), overwrite: false);
            // We can also inspect the data stream now (DataStream is nullable, so check it first)
            if (entry.DataStream is Stream data)
            {
                data.Seek(0, SeekOrigin.Begin);
                DoSomethingWithTheData(data);
            }
        }
    }
}
} Writing a new archiveThe user can generate archives using streams. FileStream archiveStream = File.Create("D:/archive.tar"); The archive can be created in V7 format: using TarWriter writerV7 = new TarWriter(archiveStream, TarFormat.V7); Or Ustar: using TarWriter writerUstar = new TarWriter(archiveStream, TarFormat.Ustar); Or Pax: using TarWriter writerPax1 = new TarWriter(archiveStream, TarFormat.Pax); // No Global Extended Attributes entry Or Pax with a Global Extended Attributes entry appended at the beginning: Dictionary<string, string> gea = new Dictionary<string, string>();
gea.Add("something", "global");
using TarWriter writerPaxGEA = new TarWriter(archiveStream, globalExtendedAttributes: gea); // Note there's no need to indicate the format, it's assumed Or GNU: using TarWriter writerGnu = new TarWriter(archiveStream, TarFormat.Gnu); The user can add entries in two ways. By indicating the path of the file to add, which will automatically detect the entry type of the file: // EntryType: Directory
writer.AddFile(fileName: "D:/IAmADirectory/", entryName: "IAmADirectory");
// EntryType: RegularFile (or if V7: OldRegularFile)
writer.AddFile(fileName: "D:/file.txt", entryName: "file.txt");
// In Unix, if the writer was opened in Ustar, Pax or Gnu, the user can also add fifo, block device and character device files to the archive
writer.AddFile(fileName: "/home/carlos/myfifo", entryName: "myfifo"); // EntryType: Fifo
writer.AddFile(fileName: "/home/carlos/myblockdevice", entryName: "myblockdevice"); // EntryType: BlockDevice
writer.AddFile(fileName: "/home/carlos/mycharacterdevice", entryName: "mychardevice"); // EntryType: CharDevice Or by manually constructing an entry. V7: TarEntryV7 entry = new TarEntryV7(entryType: TarEntryType.OldRegularFile, entryName: "file.txt");
entry.DataStream = File.Open("D:/file.txt", FileMode.Open, FileAccess.Read);
entry.Gid = 5;
entry.Uid = 7;
writerV7.WriteEntry(entry); Ustar: TarEntryUstar entry = new TarEntryUstar(entryType: TarEntryType.RegularFile, entryName: "file.txt");
entry.DataStream = File.Open("D:/file.txt", FileMode.Open, FileAccess.Read);
entry.Mode = UnixFileMode.UserRead | UnixFileMode.GroupRead | UnixFileMode.OtherRead;
entry.UName = "carlos";
entry.GName = "dotnet";
writerUstar.WriteEntry(entry); PAX: TarEntryPax entry = new TarEntryPax(entryType: TarEntryType.Directory, entryName: "directory", extendedAttributes: null); // No extended attributes, but the metadata header is created anyway
writerPax.WriteEntry(entry);
Dictionary<string, string> ea = new Dictionary<string, string>();
// PAX stores atime/ctime as a decimal number of seconds since the Unix epoch, so the value is
// formatted explicitly instead of relying on the culture-dependent DateTimeOffset.ToString()
string timestamp = DateTimeOffset.UtcNow.ToUnixTimeSeconds().ToString(System.Globalization.CultureInfo.InvariantCulture);
ea.Add("atime", timestamp);
ea.Add("ctime", timestamp);
TarEntryPax entryWithEA = new TarEntryPax(entryType: TarEntryType.SymbolicLink, entryName: "symlink", extendedAttributes: ea);
entryWithEA.LinkName = "this/is/a/link/path";
writer.WriteEntry(entryWithEA); GNU: TarEntryGnu entry = new TarEntryGnu(entryType: TarEntryType.CharacterDevice, entryName: "chardevice");
entry.DeviceMajor = 444;
entry.DeviceMinor = 555;
entry.ATime = DateTimeOffset.Now;
entry.CTime = DateTimeOffset.Now;
writerGnu.WriteEntry(entry); Creating an archive using entries from another archiveThe absence of a central directory prevents updating existing entries. But this scenario should still be possible for the user if needed. It should be especially useful if the user wants to convert entries from one format to another. using TarReader reader = new TarReader(originStream); // The detected format of this archive should not matter
using TarWriter writer = new TarWriter(destinationStream, TarFormat.Pax);
TarEntry? entry;
while ((entry = reader.GetNextEntry(copyData: true)) != null)
{
writer.WriteEntry(entry); // Entries should be saved in PAX format, reading as much as possible from the passed entry in a different format
} Creating a tar.gz archiveWe already offer GZip stream APIs, so it should be relatively easy to compress a tar archive when manipulating streams. MemoryStream archiveStream = new MemoryStream();
using (TarWriter writer = new TarWriter(archiveStream, TarFormat.Pax, leaveOpen: true)) // Do not close stream on dispose
{
    // The TarEntryPax constructor has no default for extendedAttributes, so null is passed explicitly
    TarEntryPax entry = new TarEntryPax(entryType: TarEntryType.RegularFile, entryName: "file.txt", extendedAttributes: null);
    writer.WriteEntry(entry);
} // Dispose triggers writing the empty records at the end of the archive
using FileStream compressedFileStream = File.Create("file.tar.gz");
using GZipStream compressor = new GZipStream(compressedFileStream, CompressionMode.Compress);
// The writer left the position at the end of the stream; rewind so the copy below starts at the first byte
archiveStream.Position = 0;
archiveStream.CopyTo(compressor); // After disposing these two, the tar.gz will be commited The reason why this proposal does not include Alternative DesignsWe were originally considering offering APIs that looked more similar to RisksThe complexity of the formats will require a lot of testing, particularly with rare files, files generated in unsupported/rare formats, or files containing rare entry types. The extraction APIs we offer should have a way to prevent risky behaviors like tar-bombs. There are entry types that are not supported the same way across platforms: block device, character device, fifos. This will have to be considered when extracting a file created in another OS. There are four rare entry types in the GNU format that would not be supported at the beginning due to their complexity and the difficulty to generate archives using the unix
But they can be addressed in later iterations and ignore the entry types in the meanwhile.
|
Thanks, @carlossanlop. What about async APIs? Do we need an async version of GetNextEntry, as presumably that does IO? What about WriteEntry? ExtractToFile? AddFile? etc. What about IAsyncDisposable? For UnixFileMode, how does this relate to anything we might add around chmod? cc: @eerhardt TarEntry isn't disposable but contains a Stream. Who's responsible for disposing the stream and when, for both reading and writing? |
ZipArchive doesn't have async APIs either (although they have been requested for years now: PowerShell/PowerShell#1541). Either both get async APIs or neither of them does; anything else would be inconsistent. |
What are the exact use cases? "The community has expressed interest in having .NET offer APIs that would allow creation, manipulation and extraction of tar files." I think it would be good to have that written out in more detail. |
@stephentoub I just realized one of my code examples is wrong, so I updated it. Let me clarify the behavior of the data stream:
Example of the last point (added to the proposal examples): // V7 uses OldRegularFile
TarEntryV7 entry = new TarEntryV7(entryType: TarEntryType.OldRegularFile, entryName: "file.txt");
using (FileStream dataStream = File.Open("D:/file.txt", FileMode.Open, FileAccess.Read))
{
entry.DataStream = dataStream;
writerV7.WriteEntry(entry);
} // The user created the data stream externally, so they need to dispose it themselves
// All other formats use RegularFile
TarEntryUstar entry = new TarEntryUstar(entryType: TarEntryType.RegularFile, entryName: "file.txt");
using (FileStream dataStream = File.Open("D:/file.txt", FileMode.Open, FileAccess.Read))
{
entry.DataStream = dataStream;
writerUstar.WriteEntry(entry);
} // The user created the data stream externally, so they need to dispose it themselves |
I need to think about async APIs a bit more. I mainly wanted to make sure the sync ones made sense. But I do have a couple of questions:
|
Define "later" 😄 We should review any async APIs as part of the design for the types and we should ship them as part of the same release as the rest of the type. If we want to add them in a separate PR, that's ok.
Think of it like this. Find any calls you're making to anything that might do I/O, e.g. any call to Stream.Read/Write, any call to StreamReader.ReadXx or StreamWriter.WriteXx, etc... any public API that can reach any of those calls should be considered for having an async variant. The goal is a dev using these APIs to read/write TAR files should never have to synchronously block doing I/O.
So to restate:
If so, I think that sounds ok. |
Below are the proposed async APIs I would append next to their sync versions: public abstract partial class TarEntry
{
// ... Everything else remains the same
public void ExtractToFile(string destinationFileName, bool overwrite);
+ // Async counterpart of ExtractToFile; same parameters plus an optional cancellation token
+ public ValueTask ExtractToFileAsync(string destinationFileName, bool overwrite, CancellationToken cancellationToken = default);
}
-public sealed partial class TarReader : IDisposable
+public sealed partial class TarReader : IDisposable, IAsyncDisposable
{
// ... Everything else remains the same
public void Dispose();
+ public ValueTask DisposeAsync();
public TarEntry? GetNextEntry(bool copyData = false);
+ public ValueTask<TarEntry?> GetNextEntryAsync(bool copyData = false, CancellationToken cancellationToken = default);
}
-public sealed partial class TarWriter : IDisposable
+public sealed partial class TarWriter : IDisposable, IAsyncDisposable
{
// ... Everything else remains the same
public void AddFile(string fileName, string? entryName);
+ public ValueTask AddFileAsync(string fileName, string? entryName, CancellationToken cancellationToken = default);
public void Dispose();
+ public ValueTask DisposeAsync();
public void WriteEntry(TarEntry entry);
+ public ValueTask WriteEntryAsync(TarEntry entry, CancellationToken cancellationToken = default);
}
public static partial class TarFile
{
public static void CreateFromDirectory(string sourceDirectoryName, string destinationArchiveFileName, bool includeBaseDirectory);
+ public static ValueTask CreateFromDirectoryAsync(string sourceDirectoryName, string destinationArchiveFileName, bool includeBaseDirectory, CancellationToken cancellationToken = default);
public static void ExtractToDirectory(string sourceArchiveFileName, string destinationDirectoryName, bool overwriteFiles);
+ public static ValueTask ExtractToDirectoryAsync(string sourceArchiveFileName, string destinationDirectoryName, bool overwriteFiles, CancellationToken cancellationToken = default);
} One thing mentioned in the API usage section was that when the user creates a |
|
... I half wonder if we should have a convenience public void AddStream(System.IO.Stream stream, string entryName);
public ValueTask AddStreamAsync(System.IO.Stream stream, string entryName, CancellationToken cancellationToken = default); ... presumably the |
@Clockwork-Muse well, if the user needs to add a tar entry with the contents of a stream, it basically means they want to add a regular file entry. The most common use case is that where they need to add a regular file entry from an actual filesystem object, and that's why we are already suggesting the The So having an |
This is false. |
@Clockwork-Muse It's not false, but maybe I wasn't clear enough. Allow me to elaborate: If a user needs to create an entry that will contain a non-null The scenario you described can be achieved like this with the current APIs, all in-memory, with no interaction with the filesystem: private Stream CreateArchiveWithOneRegularFileEntry(Stream entryData)
{
    MemoryStream archiveStream = new MemoryStream();
    // leaveOpen is passed as a named argument so the MemoryStream survives the writer's disposal
    using (TarWriter writer = new TarWriter(archiveStream, TarFormat.Pax, leaveOpen: true))
    {
        // The TarEntryPax constructor has no default for extendedAttributes, so null is passed explicitly
        TarEntryPax entry = new TarEntryPax(TarEntryType.RegularFile, "file.txt", extendedAttributes: null);
        // ... here's where the user can change the other entry fields like Mode, Extended Attributes, etc. ...
        entry.DataStream = entryData; // RegularFile is the only entry type that would allow setting the DataStream
        writer.WriteEntry(entry);
    } // Disposing the writer writes the end-of-archive records
    return archiveStream;
}
} |
Ah, that's what you were referring to. |
@Clockwork-Muse The trouble, as I understand it, is that from "just a stream" there's not enough information to set the uid/gid/LMT/modebits/etc file entry attributes. Presumably some sort of defaults will apply when going the manual create-from-a-stream approach, but at least there it has the chance of being a bit more in-your-face that it's something you might want to set. |
For the async APIs: My understanding, which I'm always clear that I defer to @stephentoub here, is that unless you're @stephentoub and you wrote down the non-generic ValueTask you meant instead non-generic Task. The only generic one (GetNextEntryAsync) probably stands a chance of not yielding every now and then, so being |
Next steps:
@stephentoub would you mind giving us your input on no. 2? And any other feedback is also welcome, if you have any. |
I mostly agree with Jeremy's take. GetNextEntryAsync should return |
Looking at this API, it seems to me that it tries to precisely model the supported tar formats. But I feel like this precision comes at the cost of convenience and simplicity: it requires the user to understand the details of the formats, when the API could hide these to some extent. Some examples:
Adjusting the API this way would make it worse for those who care about the minutia of the tar formats, but I think that's an acceptable tradeoff, to make it easier for the common use. |
The main PR has been merged. Reopening this issue since I still need to implement the async APIs. |
API proposal: dotnet#65951
* Implement Tar APIs (#67883) API proposal: #65951 * Add assembly to NetCoreAppLibrary.props * Remove <PackageDescription> from src csproj since it is not OOB. * Add NetCoreAppCurrent to src csproj TargetFrameworks * Additional src csproj changes. * Allow sharing of input tar file for read Co-authored-by: carlossanlop <carlossanlop@users.noreply.github.com> Co-authored-by: Dan Moseley <danmose@microsoft.com>
@dotnet/area-meta should there be an |
@carlossanlop is there a reason you marked area-System.IO? Shouldn't this and all tar issues/PRs be area-System.IO.Compression? (and then @dakersnar might be a good time to retrain) |
@danmoseley the tar format does not offer any compression, it is only an archiving format. It's the reason why we decided to put it in the System.Formats.Tar assembly and namespace. I have been adding all the tar issues and PRs into System.IO for that reason, and also because we do not have a System.Formats.Tar area label, and I don't think we currently want one. |
Sounds good. Nit: the work is listed in the compression epic. Still I expect there is enough material that retraining should be helpful |
@carlossanlop I was playing with the API and hit a snag when trying to create a using (FileStream fs = File.Create(tempTarballPath))
using (GZipStream gz = new(fs, CompressionMode.Compress))
using (TarWriter writer = new(gz))
{
...
} It crashes with
Is that something that can/should be expected to work? In your example you tar to a |
It's a bug. I opened #70172 to get it fixed. Thanks for reporting it, @rainersigwald |
Here's the fix: #70178 |
Closing since all the APIs in the initial proposal have been implemented, sync and async, with their subsequent refactorings to address multi-format archives and Global Extended Attributes entries. |
Nice - yay @carlossanlop ! |
Background and motivation
Creating a new issue to get fresh feedback. Original tar proposal
Tar is an old, stable and robust archiving format that is heavily used, particularly in the Unix world.
The community has expressed interest in having .NET offer APIs that would allow creation, manipulation and extraction of tar files. The following proposal aims to satisfy the request.
API Proposal
The tar archiving format's specification is best described in the FreeBSD man 5 page.
The tar spec defines a set of rules to collect filesystem objects into a single stream of bytes. A tar archive consists of a series of 512-byte records, where the first record that represents a filesystem object (the "header") contains fixed-size metadata fields describing said object, and the subsequent records have the actual data of the file. When the data size is not a multiple of 512, it is always zero-padded to guarantee the next record (or "header" record) will be found on the next multiple of 512. The end of a tar archive is found if at least two zero-byte 512 records are found.
Unlike the zip archiving+compression format, the tar format does not have a central directory. This means there is no way of knowing how many files a tar archive contains unless the whole archive is traversed.
The tar format evolved over time, and currently there are four well known formats:
1979 Version 7 AT&T Unix Tar Command Format. Known as "V7". This format supports regular files, directories, symbolic links and hard links. Filenames and linknames are limited to 100 bytes.
POSIX IEEE 1003.1-1988 Unix Standard Tar Format. Known as "Ustar". This format was an improvement of V7, so in a way it's backwards compatible with it. The main improvements were:
POSIX IEEE 1003.1-2001 ("POSIX.1") Pax Interchange Tar Format. Known as "PAX". This is the standard format, the most flexible, and the one with the fewest limitations. It's built on top of ustar, so it's backwards compatible with both ustar and V7. Advantages:
GNU Tar Format.
Other formats: There is little documentation about them (schilly tar, gnu tar pax, aix tar, solaris tar, macosx tar) so these APIs should be able to read the archives and extract as best as possible, but would not be able to write them.
The default format for the writing APIs is proposed to be PAX.
Here's a table I created showing the differences between formats:
API Usage
We can split the APIs into different categories according to the type of usage they are intended for:
Stream-less APIs
The
TarFile
static class makes it easy to archive the contents of a directory or extract the contents of a tar archive without any need to manipulate streams:
Reading an archive entry by entry
The
TarReader
class allows reading an existing tar archive represented by a stream:
The only requirement to be able to iterate the entries of a stream representing a tar archive is that the stream is readable.
The archive format should be immediately detected upon creation of the reader, even when the first entry has not been read by the user yet.
If the Global Extended Attributes dictionary is not null, it's safe to assume the archive format is PAX, since it's the only format that supports them.
If leaveOpen is passed to the constructor, the stream is not disposed when the reader is disposed.
The streams created to wrap the data section of an entry are automatically disposed when the reader is disposed.
What if the passed stream is unseekable, like when it comes from the network? Then the user will have two options:
They can read it as it arrives, but knowing that it will be lost when the next entry is read:
Or they can request to get the data preserved internally for reading later:
Writing a new archive
The user can generate archives using streams.
The archive can be created in V7 format:
Or Ustar:
Or Pax:
Or Pax with a Global Extended Attributes entry appended at the beginning:
Or GNU:
The user can add entries in two ways.
By indicating the path of the file to add, which will automatically detect the entry type of the file:
Or by manually constructing an entry.
Notice that
OldRegularFile
(V7 only) and RegularFile
(all other formats) are the only two entry types the user can create with a data section. To do that, they need to save a stream in the DataStream
property containing the information to write, and then they need to dispose it.
V7:
Ustar:
PAX:
GNU:
Creating an archive using entries from another archive
The absence of a central directory prevents updating existing entries. But this scenario should still be possible for the user if needed. It should be especially useful if the user wants to convert entries from one format to another.
Creating a tar.gz archive
We already offer GZip stream APIs, so it should be relatively easy to compress a tar archive when manipulating streams.
The reason why this proposal does not include
TarFile
APIs to enable compression support, is because we first need to decide how to standardize the compression configuration pattern for all the compression formats we support. This is being discussed here: #42820Alternative Designs
We were originally considering offering APIs that looked more similar to
ZipArchive
, but the absence of a central directory and the mixture of writing and reading tasks would make the APIs very difficult to use, especially due to the existence of an "Update" mode. In Zip, the presence of a central directory helps with the complexities of modifying an existing archive, but in tar, not knowing the entries in advance makes it extremely complicated, especially with huge files or with unseekable streams. That proposal was discussed in the old tar issue.Risks
The complexity of the formats will require a lot of testing, particularly with rare files, files generated in unsupported/rare formats, or files containing rare entry types.
The extraction APIs we offer should have a way to prevent risky behaviors like tar-bombs.
There are entry types that are not supported the same way across platforms: block device, character device, fifos. This will have to be considered when extracting a file created in another OS.
There are five rare entry types in the GNU format that would not be supported at the beginning due to their complexity and the difficulty to generate archives using the unix
tar
tool for testing them. For example:Contiguous files ('7'): The documentation states that this entry type should be treated as "regular file" except on one obscure "RTOS" (Real-Time Operating System, the spec does not say which) where this entry type is used to indicate the pre-allocation of a contiguous file on disk.
Multi-volume files ('M'): Allows splitting a file into different archives. To add support to this entry type, new APIs would be required, particularly on
TarFile
, to ensure multiple files can be grouped into one single extraction.Files to be renamed or symlinked after extraction ('N'): This entry type is no longer generated by the 'GNU' tar due to security concerns.
Sparse regular files ('S'): Fragmented files that are stored split among multiple entries with this entry type.
Tape/volume header name ('V'): The spec says this entry type is ignored.
But they can be addressed in later iterations and ignore the entry types in the meanwhile.
The text was updated successfully, but these errors were encountered: