From 9fe4f3be5f04567a8273419c1a2d063c90383bf5 Mon Sep 17 00:00:00 2001 From: Stephen Steneker Date: Tue, 4 Feb 2020 15:10:38 +1100 Subject: [PATCH] Initial implementation of mtransfer tool (#754) --- README.rst | 11 +- doc/index.rst | 11 +- doc/install.rst | 10 +- doc/mlaunch.rst | 9 ++ doc/mlogfilter.rst | 9 ++ doc/mloginfo.rst | 9 ++ doc/mlogvis.rst | 9 ++ doc/mplotqueries.rst | 9 ++ doc/mtransfer.rst | 161 ++++++++++++++++++++ mtools/mtransfer/__init__.py | 0 mtools/mtransfer/mtransfer.py | 276 ++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- setup.py | 10 +- 13 files changed, 515 insertions(+), 11 deletions(-) create mode 100644 doc/mtransfer.rst create mode 100644 mtools/mtransfer/__init__.py create mode 100644 mtools/mtransfer/mtransfer.py diff --git a/README.rst b/README.rst index 13afd651..8c0220fa 100644 --- a/README.rst +++ b/README.rst @@ -6,7 +6,8 @@ mtools **mtools** is a collection of helper scripts to parse, filter, and visualize MongoDB log files (``mongod``, ``mongos``). mtools also includes ``mlaunch``, a -utility to quickly set up complex MongoDB test environments on a local machine. +utility to quickly set up complex MongoDB test environments on a local machine, +and ``mtransfer``, a tool for transferring databases between MongoDB instances. .. 
figure:: https://raw.githubusercontent.com/rueckstiess/mtools/develop/mtools.png :alt: mtools box @@ -26,7 +27,7 @@ The following tools are in the mtools collection: special sections like restarts, connections, distinct view `mplotqueries `__ - visualize log files with different types of plots (requires matplotlib) + visualize log files with different types of plots (requires ``matplotlib``) `mlogvis `__ creates a self-contained HTML file that shows an interactive visualization @@ -34,7 +35,11 @@ The following tools are in the mtools collection: `mlaunch `__ a script to quickly spin up local test environments, including replica sets - and sharded systems (requires pymongo) + and sharded systems (requires ``pymongo``) + +`mtransfer `__ + an experimental script to transfer databases between MongoDB instances by + copying WiredTiger data files (requires ``pymongo`` and ``wiredtiger``) For more information, see the `mtools documentation `__. diff --git a/doc/index.rst b/doc/index.rst index 1ab0f317..d23c4769 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -17,6 +17,7 @@ utility to quickly set up complex MongoDB test environments on a local machine. 
mloginfo.rst mlogvis.rst mplotqueries.rst + mtransfer.rst contributing.rst changelog.rst @@ -36,15 +37,19 @@ The following tools are in the mtools collection: special sections like restarts, connections, distinct view :ref:`mplotqueries` - visualize log files with different types of plots (requires matplotlib) + visualize log files with different types of plots (requires ``matplotlib``) :ref:`mlogvis` creates a self-contained HTML file that shows an interactive visualization - in a web browser (as an alternative to mplotqueries) + in a web browser (as an alternative to ``mplotqueries``) :ref:`mlaunch` a script to spin up local test environments quickly, including replica sets - and sharded systems (requires pymongo) + and sharded systems (requires ``pymongo``) + +:ref:`mtransfer` + an experimental script to transfer WiredTiger databases between MongoDB + instances by copying data files (requires ``pymongo`` and ``wiredtiger``) The `mtools source code `__ is available on GitHub under an `Apache 2.0 license diff --git a/doc/install.rst b/doc/install.rst index a5ee74eb..01591bee 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -109,7 +109,7 @@ processes. pymongo ------- -*required for mlaunch* +*required for mlaunch and mtransfer* `pymongo `__ is MongoDB's official Python driver. ``mlaunch`` uses this to configure and query local MongoDB @@ -131,3 +131,11 @@ numpy `numpy `__ is a Python module for scientific computing and numerical calculations. + +wiredtiger +---------- + +*required for mtransfer* + +`WiredTiger `__ is the default +storage engine for MongoDB. \ No newline at end of file diff --git a/doc/mlaunch.rst b/doc/mlaunch.rst index 8c0370b1..dce8a68f 100644 --- a/doc/mlaunch.rst +++ b/doc/mlaunch.rst @@ -693,3 +693,12 @@ Optional Parameters This command displays a list of all nodes, their status and port number, and in addition, their startup commands. + +Disclaimer +~~~~~~~~~~ + +This software is not supported by `MongoDB, Inc. 
`__ +under any of their commercial support subscriptions or otherwise. Any usage of +mtools is at your own risk. Bug reports, feature requests and questions can be +posted in the `Issues +`__ section on GitHub. \ No newline at end of file diff --git a/doc/mlogfilter.rst b/doc/mlogfilter.rst index 5a0068d8..0c5d6003 100644 --- a/doc/mlogfilter.rst +++ b/doc/mlogfilter.rst @@ -403,3 +403,12 @@ log file: .. code-block:: bash mlogfilter mongod.log --from "end -2h" --to +1h + +Disclaimer +~~~~~~~~~~ + +This software is not supported by `MongoDB, Inc. `__ +under any of their commercial support subscriptions or otherwise. Any usage of +mtools is at your own risk. Bug reports, feature requests and questions can be +posted in the `Issues +`__ section on GitHub. \ No newline at end of file diff --git a/doc/mloginfo.rst b/doc/mloginfo.rst index 21727e13..c945372a 100644 --- a/doc/mloginfo.rst +++ b/doc/mloginfo.rst @@ -392,3 +392,12 @@ For example: local1.myCollection insert None None None None invoice-prod.invoices insert 12768411 22233323 86313 12344 invoice-prod.invoices insert 12868411 22233323 86313 12344 + +Disclaimer +~~~~~~~~~~ + +This software is not supported by `MongoDB, Inc. `__ +under any of their commercial support subscriptions or otherwise. Any usage of +mtools is at your own risk. Bug reports, feature requests and questions can be +posted in the `Issues +`__ section on GitHub. \ No newline at end of file diff --git a/doc/mlogvis.rst b/doc/mlogvis.rst index 46a456b3..355ab567 100644 --- a/doc/mlogvis.rst +++ b/doc/mlogvis.rst @@ -42,3 +42,12 @@ Version ------- ``--version`` shows the version number and exits. + +Disclaimer +~~~~~~~~~~ + +This software is not supported by `MongoDB, Inc. `__ +under any of their commercial support subscriptions or otherwise. Any usage of +mtools is at your own risk. Bug reports, feature requests and questions can be +posted in the `Issues +`__ section on GitHub. 
\ No newline at end of file diff --git a/doc/mplotqueries.rst b/doc/mplotqueries.rst index c7ab66e2..37fa257f 100644 --- a/doc/mplotqueries.rst +++ b/doc/mplotqueries.rst @@ -571,3 +571,12 @@ Remove Overlays To remove all overlays, you can use this command. It will delete all existing overlays, and the next (or current, if a log file is specified as well) call to **mplotqueries** will not show additional overlays. + +Disclaimer +~~~~~~~~~~ + +This software is not supported by `MongoDB, Inc. `__ +under any of their commercial support subscriptions or otherwise. Any usage of +mtools is at your own risk. Bug reports, feature requests and questions can be +posted in the `Issues +`__ section on GitHub. diff --git a/doc/mtransfer.rst b/doc/mtransfer.rst new file mode 100644 index 00000000..83d6edf5 --- /dev/null +++ b/doc/mtransfer.rst @@ -0,0 +1,161 @@ +.. _mtransfer: + +========= +mtransfer +========= + +``mtransfer`` allows WiredTiger databases to be exported from one MongoDB +instance and imported into another. + + +Caveats +~~~~~~~ + +The ``mtransfer`` script is EXPERIMENTAL and has a number of important usage caveats: + +- MongoDB must be started with the ``--directoryperdb`` flag. +- ``mtransfer`` does not work with sharding, the encrypted storage engine, or + MMAPv1 data files. +- To export or import a database, MongoDB must not be running against either the + source or destination database path. +- A database must be imported to all nodes in a replica set or query results + will be inconsistent. +- A database cannot be imported to any node in the replica set it was exported + from. Collections have unique identifiers, and this would violate that uniqueness. +- ``mtransfer`` currently only supports database files compressed with the + default ``snappy`` library. + +While there are some sanity checks built into the script, manipulating MongoDB +files directly is inherently dangerous. Take care to test and back up your data. 
+ +Installation +~~~~~~~~~~~~ + +The ``mtransfer`` script requires the +`wiredtiger Python library `__ +which is currently not installed by default with ``mtools``. + +The ``wiredtiger`` library can be installed via ``pip`` or built from source, +but in either case requires: + +- A C compiler. +- The ``snappy`` and ``zlib`` development packages installed. + +To install via ``pip`` use either of: + +.. code-block:: bash + + pip install mtools[mtransfer] + +.. code-block:: bash + + pip install wiredtiger + +If you are encountering errors using or installing the ``wiredtiger`` module +via ``pip``, you may need to `Build and install WiredTiger from source +`__. + +Usage +~~~~~ + +.. code-block:: bash + + mtransfer [-h] [--version] [--verbose] [--force] [--dbpath DBPATH] + {export,import} database + +General Parameters +~~~~~~~~~~~~~~~~~~ + +Help +---- +``-h, --help`` + shows the help text and exits. + +Version +------- +``--version`` + shows the version number and exits. + +Verbosity +--------- +``--verbose`` + shows extra information. + +Commands +~~~~~~~~ + +``mtransfer`` reads or writes the data files in a MongoDB instance +using the WiredTiger storage engine. + +Database Path +------------- +``--dbpath path`` the path to the MongoDB database files (defaults to +the current working directory where the script is run). + +Command +------- +``export`` +creates a file ``mtransfer.bson`` in the specified database directory. +This must be copied along with the other files in the directory to the +destination server(s). + +``import`` reads the ``mtransfer.bson`` file from the specified database +directory and updates MongoDB's metadata to include the database. + +Database +--------- +The name of the database to export / import. The MongoDB database name +must match the directory name on disk for the export, and the MongoDB +database name will be set to the directory name for the import. 
+ +Example Database Transfer and Rename +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On the origin server +-------------------- + +Before starting, ensure MongoDB is not running. + +1. Change into the origin directory and create 'olddb/mtransfer.bson' + with exported metadata: + +.. code-block:: bash + + cd /from/dbpath + mtransfer export olddb + +2. Copy the database files to the destination + +.. code-block:: bash + + rsync -av olddb destination:/to/dbpath + +On the destination server +------------------------- + +Before starting, ensure MongoDB is not running. + +3. Rename the database directory (optional) + +.. code-block:: bash + + cd /to/dbpath + mv olddb newdb + +4. Import the database (with the new name, if renamed) + +.. code-block:: bash + + mtransfer import newdb + +5. Start ``mongod`` and confirm the transferred database + is now available. + +Disclaimer +~~~~~~~~~~ + +This software is not supported by `MongoDB, Inc. `__ +under any of their commercial support subscriptions or otherwise. Any usage of +mtools is at your own risk. Bug reports, feature requests and questions can be +posted in the `Issues +`__ section on GitHub. 
\ No newline at end of file
diff --git a/mtools/mtransfer/__init__.py b/mtools/mtransfer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mtools/mtransfer/mtransfer.py b/mtools/mtransfer/mtransfer.py
new file mode 100644
index 00000000..222e05bf
--- /dev/null
+++ b/mtools/mtransfer/mtransfer.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python
+"""Transfer a database between MongoDB instances via WiredTiger metadata.
+
+``export`` writes one BSON document per collection of a database to
+``<dbpath>/<database>/mtransfer.bson``. ``import`` reads that file and adds
+the collections to the MongoDB catalog and WiredTiger metadata of a dbpath.
+"""
+
+import argparse
+import os
+import re
+import sys
+import bson
+import wiredtiger
+
+from mtools.util.cmdlinetool import BaseCmdLineTool
+from mtools.version import __version__
+
+# Catalog documents are decoded and encoded with the standard UUID representation.
+codec_options = bson.codec_options.CodecOptions(uuid_representation=bson.binary.STANDARD)
+
+
+class MTransferTool(BaseCmdLineTool):
+    """Implementation of the ``mtransfer export`` / ``mtransfer import`` commands."""
+
+    def __init__(self):
+        """Set up the argument parser."""
+        BaseCmdLineTool.__init__(self)
+
+        self.argparser.description = ('Import and export databases between MongoDB deployments '
+                                      'for WiredTiger storage with directoryPerDB configuration.')
+
+        # No nargs=1 here: it would turn a user-supplied path into a list
+        # while the default stayed a plain string.
+        self.argparser.add_argument('--dbpath', dest='dbpath', default='.',
+                                    help='MongoDB database path')
+
+        self.argparser.add_argument('--force', action='store_true',
+                                    help='ignore safety checks')
+
+        self.argparser.add_argument('--verbose', action='store_true',
+                                    help='enable verbose output')
+
+        self.argparser.add_argument('command', choices=['export', 'import'])
+
+        self.argparser.add_argument('database', nargs=1, type=str,
+                                    help='name of the database to export / import')
+
+    def run(self, arguments=None):
+        """Check the dbpath's storage settings, then run the export or import.
+
+        Problems are reported on stderr and end the run early.
+        """
+        BaseCmdLineTool.run(self, arguments)
+
+        self.dbpath = self.args['dbpath']
+        self.force = self.args['force']
+        self.verbose = self.args['verbose']
+
+        # Read storage.bson, sanity check.
+        try:
+            with open(os.path.join(self.dbpath, 'storage.bson'), 'rb') as storage_file:
+                storage_raw = storage_file.read()
+        except (IOError, OSError) as e:
+            sys.stderr.write('Failed to open storage.bson in "{0}": {1}\n'.format(self.dbpath, e))
+            return
+
+        # Options missing from storage.bson are treated as disabled.
+        settings = bson.decode(storage_raw)["storage"]["options"]
+        if not settings.get("directoryPerDB", False):
+            sys.stderr.write('Requires a database created with --directoryperdb\n')
+            return
+        if (settings.get("directoryForIndexes", False) or
+                settings.get("groupCollections", False)):
+            sys.stderr.write('Incompatible storage settings detected: '
+                             'directoryForIndexes or groupCollections\n')
+            if not self.force:
+                return
+
+        self.database = self.args['database'][0]
+        self.nsprefix = self.database + '.'
+
+        mtransfer_dir = os.path.join(self.dbpath, self.database)
+        mtransfer_file = os.path.join(mtransfer_dir, 'mtransfer.bson')
+
+        if self.args['command'] == 'export':
+            if not os.path.exists(mtransfer_dir):
+                sys.stderr.write('Expected source directory "{0}" does not exist. '
+                                 'Check the database name is correct.\n'.format(mtransfer_dir))
+                return
+            if not self.force and os.path.exists(mtransfer_file):
+                sys.stderr.write('Output file "{0}" already exists\n'.format(mtransfer_file))
+                return
+            with open(mtransfer_file, 'wb') as outf:
+                self.doExport(outf)
+        elif self.args['command'] == 'import':
+            if not os.path.exists(mtransfer_dir):
+                sys.stderr.write('Expected target directory "{0}" does not exist. '
+                                 'Check the database name is correct.\n'.format(mtransfer_dir))
+                return
+            if not os.path.exists(mtransfer_file):
+                sys.stderr.write('Cannot import: mtransfer file "{0}" does not exist.\n'.
+                                 format(mtransfer_file))
+                return
+            with open(mtransfer_file, 'rb') as inf:
+                self.doImport(inf)
+
+    def message(self, msg):
+        """Print msg when --verbose was given."""
+        if self.verbose:
+            print(msg)
+
+    def doExport(self, outf):
+        """Write one BSON document per collection of the database to outf.
+
+        outf is a writable binary file object. The dbpath is opened read-only.
+        """
+        # Attempt to connect to the specified WiredTiger database
+        try:
+            conn = wiredtiger.wiredtiger_open(
+                self.dbpath,
+                'log=(compressor=snappy,path=journal,recover=error),readonly=true')
+        except Exception as e:
+            sys.stderr.write('Failed to open dbpath "{0}": {1}\n'.format(self.dbpath, e))
+            return
+
+        # Close the connection even when a lookup below fails.
+        try:
+            self._doExport(conn, outf)
+        finally:
+            conn.close()
+
+    def _doExport(self, conn, outf):
+        """Export every catalog entry whose namespace belongs to the database."""
+        session = conn.open_session()
+
+        # Find all collections in the database
+        catalog = session.open_cursor('table:_mdb_catalog')
+        sizeStorer = session.open_cursor('table:sizeStorer')
+        wtMeta = session.open_cursor('metadata:')
+        wtMetaCreate = session.open_cursor('metadata:create')
+        for _, meta_raw in catalog:
+            meta = bson.decode(meta_raw, codec_options=codec_options)
+            ns = meta[u'ns']
+            if not ns or not ns.startswith(self.nsprefix):
+                continue
+            # Explicit check rather than assert, which "python -O" strips.
+            if ns != meta[u'md'][u'ns']:
+                raise ValueError('Inconsistent catalog entry: "{0}" != "{1}"'.
+                                 format(ns, meta[u'md'][u'ns']))
+
+            # Iterate through indexes first
+            indexes = {}
+            for idxName, idxIdent in meta[u'idxIdent'].items():
+                indexes[idxName] = self._identMetadata(wtMeta, wtMetaCreate, idxIdent)
+
+            ident = str('table:' + meta[u'ident'])
+            size = bson.decode(sizeStorer[ident.encode()])
+            coll_meta = self._identMetadata(wtMeta, wtMetaCreate, meta[u'ident'])
+            export = {
+                'collname': ns[len(self.nsprefix):],
+                'filename': coll_meta['filename'],
+                'mdb_catalog': meta,
+                'sizeStorer': size,
+                'wtmeta_table': coll_meta['wtmeta_table'],
+                'wtmeta_file': coll_meta['wtmeta_file'],
+                'indexes': indexes,
+                'version': __version__,
+            }
+            self.message(str(export))
+            outf.write(bson.encode(export, codec_options=codec_options))
+
+    def _identMetadata(self, wtMeta, wtMetaCreate, mdb_ident):
+        """Return file name and WiredTiger metadata for a collection or index ident."""
+        table_uri = str('table:' + mdb_ident)
+        filename = table_uri[len('table:'):] + '.wt'
+        wtmeta_file = wtMeta['file:' + filename]
+        wtmeta_table = wtMetaCreate[table_uri]
+        # Assumes the ident starts with "<database>/", which is as long as
+        # nsprefix ("<database>."), so the slice leaves the bare file name.
+        return {'filename': filename[len(self.nsprefix):],
+                'wtmeta_table': wtmeta_table,
+                'wtmeta_file': wtmeta_file}
+
+    def doImport(self, inf):
+        """Add the collections described in inf to the target dbpath.
+
+        inf is a readable binary file object holding the exported documents.
+        """
+        try:
+            conn = wiredtiger.wiredtiger_open(
+                self.dbpath,
+                'log=(enabled=false,compressor=snappy,path=journal,recover=error)')
+        except Exception as e:
+            sys.stderr.write('Failed to open dbpath "{0}": {1}\n'.format(self.dbpath, e))
+            return
+
+        try:
+            imported = self._doImport(conn, inf)
+        except Exception as e:
+            sys.stderr.write('Import failed: {0}\n'.format(e))
+        else:
+            # Only report success when the changes were committed.
+            if imported:
+                print('Import complete')
+        finally:
+            conn.close()
+
+    def _checkExports(self, exports):
+        """Return True when the exported collections may be imported.
+
+        A missing data file or a different mtools version rejects the import
+        unless --force was given; a missing file is reported either way.
+        """
+        for export in exports:
+            data_file = os.path.join(self.dbpath, self.database, export['filename'])
+            if not os.path.exists(data_file):
+                sys.stderr.write(
+                    'File "{0}" referenced in export missing during import\n'.
+                    format(export['filename']))
+                if not self.force:
+                    return False
+
+            if not self.force and export['version'] != __version__:
+                sys.stderr.write(
+                    'Database was exported with mtools version {0}, '
+                    'current version {1} may not be compatible\n'.
+                    format(export['version'], __version__))
+                return False
+        return True
+
+    def _writeMetadata(self, wtMeta, ident, file_meta, app_metadata, key_format, file_id):
+        """Write the table, colgroup and file metadata entries for one ident."""
+        file_uri = 'file:' + ident + '.wt'
+        wtMeta['table:' + ident] = (
+            app_metadata +
+            ',colgroups=,collator=,columns=,key_format={0},value_format=u'.format(key_format))
+        wtMeta['colgroup:' + ident] = (
+            app_metadata + ',collator=,columns=,source="' + file_uri + '",type=file')
+        # The file entry keeps the exported metadata but gets a fresh file ID.
+        wtMeta[file_uri] = file_meta + ',' + app_metadata + ',id={0:d}'.format(file_id)
+
+    def _doImport(self, conn, inf):
+        """Add every exported collection to the catalog and WiredTiger metadata.
+
+        Returns True once the changes are committed, or False when a safety
+        check rejected the import before anything was modified.
+        """
+        # Validate all documents before opening a session, so a rejected
+        # import never starts modifying the target.
+        exports = list(bson.decode_file_iter(inf, codec_options=codec_options))
+        if not self._checkExports(exports):
+            return False
+
+        app_metadata_re = re.compile(r'app_metadata=\(.*?\)')
+
+        session = conn.open_session()
+        session.begin_transaction()
+
+        catalog = session.open_cursor('table:_mdb_catalog')
+        sizeStorer = session.open_cursor('table:sizeStorer')
+        wtMeta = session.open_cursor('metadata:', None, 'readonly=false')
+
+        # Get the maximum file ID in the WT catalog: we will be appending
+        session.create('file:_mtransfer')
+        newfile_meta = wtMeta['file:_mtransfer']
+        self.message('Got new file metadata "{0}"'.format(newfile_meta))
+        session.drop('file:_mtransfer')
+        file_id = int(re.search(r',id=(\d+),', newfile_meta).group(1))
+
+        # Get the maximum ID in the MDB catalog: we will be appending
+        catalog.prev()
+        maxID = catalog.get_key()
+
+        for export in exports:
+            # Figure out the new namespace
+            ns = self.database + '.' + export['collname']
+
+            # First process the indexes
+            idxIdent = {}
+            for idxName, idx in export['indexes'].items():
+                ident = self.database + '/' + idx['filename'][:-3]
+                app_metadata = app_metadata_re.search(idx['wtmeta_file']).group(0)
+                # For older style index metadata, update the namespace
+                app_metadata = re.sub(r'"ns" : ".*?"', '"ns" : "{0}"'.format(ns), app_metadata)
+                self.message('For index "{0}", app_metadata = "{1}"'.format(idxName, app_metadata))
+                self._writeMetadata(wtMeta, ident, idx['wtmeta_file'], app_metadata, 'u', file_id)
+                file_id += 1
+                idxIdent[idxName] = ident
+                self.message('Adding index "{0}" with ident "{1}"'.format(idxName, ident))
+
+            # Figure out the WT ident of the collection
+            ident = self.database + '/' + export['filename'][:-3]
+            app_metadata = app_metadata_re.search(export['wtmeta_file']).group(0)
+            self.message('For collection "{0}", app_metadata = "{1}"'.format(ns, app_metadata))
+            self._writeMetadata(wtMeta, ident, export['wtmeta_file'], app_metadata, 'q', file_id)
+            file_id += 1
+
+            # sizeStorer is keyed by table URI (export looks entries up that
+            # way), so the "table:" prefix is required here as well.
+            sizeStorer[('table:' + ident).encode()] = bson.encode(export['sizeStorer'])
+
+            # Fix the catalog entry to refer to the new namespace and new table names
+            catalog_entry = export['mdb_catalog']
+            catalog_entry[u'ns'] = ns
+            catalog_entry[u'md'][u'ns'] = ns
+            catalog_entry[u'ident'] = ident
+            catalog_entry[u'idxIdent'] = idxIdent
+            for index_md in catalog_entry[u'md'][u'indexes']:
+                index_md[u'spec'][u'ns'] = ns
+            maxID += 1
+            self.message('Adding catalog entry {0} -> {1}'.format(maxID, catalog_entry))
+            catalog[maxID] = bson.encode(catalog_entry, codec_options=codec_options)
+
+        session.commit_transaction()
+        return True
+
+
+def main():
+    """Console entry point; run the tool and return the exit status."""
+    tool = MTransferTool()
+    tool.run()
+    return 0  # we need to return an integer
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/requirements.txt b/requirements.txt
index 4f3b48c6..133cf74a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,5 @@ ordereddict>=1.1
 python-dateutil>=2.7
 matplotlib>=3.1.1
 numpy>=1.16.4
-pymongo>=3.8.0
+pymongo>=3.9.0
 psutil>=5.6.3
diff --git a/setup.py b/setup.py
index 220ec2d2..38698763 100644
--- a/setup.py
+++ b/setup.py
@@ -17,12 +17,14 @@
 # simplify the default install experience, particularly where a build
 # toolchain is required.
extras_requires = { - "all": ['matplotlib>=3.1.1', 'numpy>=1.16.4', 'pymongo>=3.8.0', 'psutil>=5.6.3'], - "mlaunch": ['pymongo>=3.8.0', 'psutil>=5.6.3'], + "all": ['matplotlib>=3.1.1', 'numpy>=1.16.4', 'pymongo>=3.9.0', + 'psutil>=5.6.3'], + "mlaunch": ['pymongo>=3.9.0', 'psutil>=5.6.3'], "mlogfilter": [], "mloginfo": ['numpy>=1.16.4'], "mlogvis": [], "mplotqueries": ['matplotlib>=3.1.1', 'numpy>=1.16.4'], + "mtransfer": ['pymongo==3.9.0', 'wiredtiger>=3.2.1'], } try: @@ -49,6 +51,7 @@ 'mtools.mlogvis', 'mtools.mplotqueries', 'mtools.mgenerate', + 'mtools.mtransfer', 'mtools.test', 'mtools.util', 'mtools.mlogfilter.filters', @@ -87,7 +90,8 @@ "mlogfilter=mtools.mlogfilter.mlogfilter:main", "mloginfo=mtools.mloginfo.mloginfo:main", "mlogvis=mtools.mlogvis.mlogvis:main", - "mplotqueries=mtools.mplotqueries.mplotqueries:main" + "mplotqueries=mtools.mplotqueries.mplotqueries:main", + "mtransfer=mtools.mtransfer.mtransfer:main", ], }, author='Thomas Rueckstiess',