diff --git a/docs/Makefile b/docs/Makefile index 298ea9e..f060517 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,4 +16,5 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + eukcc -h > _static/eukcc-help.txt + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/eukcc-help.txt b/docs/_static/eukcc-help.txt new file mode 100644 index 0000000..a5b658d --- /dev/null +++ b/docs/_static/eukcc-help.txt @@ -0,0 +1,61 @@ +usage: eukcc [-h] --db DB [--outdir OUTDIR] [--config CONFIG] [--ncores int] + [--ncorespplacer int] [--hmm HMM] [--training] [--proteins] + [--bed file.bed] [--force] [--keeptemp] [--fplace] [--noglob] + [--quiet] [--debug] [--HPA] [--nPlacements n] [--minGenomes n] + [--fullineage] [--minPlacementLikelyhood float] [--mindist n] + [--touch] [--gmes] [--pygmes] [--diamond DIAMOND] [--plot] [-v] + fasta + +Evaluate completeness and contamination of a MAG. Args that start with '--' +(eg. --db) can also be set in a config file (specified via --config). Config +file syntax allows: key=value, flag=true, stuff=[a,b,c] (for details, see +syntax at https://goo.gl/R74nmi). If an arg is specified in more than one +place, then commandline values override config file values which override +defaults. + +positional arguments: + fasta Run script on this bin (fasta file) + +optional arguments: + -h, --help show this help message and exit + --db DB Path to EukCC DB + --outdir OUTDIR, -o OUTDIR + Location for the output. Names will be prefixed using + the bin filenames + --config CONFIG, -c CONFIG + Config file to define parameters, YAML + --ncores int, -n int set number of cores for GeneMark-ES, pplacer and Hmmer + --ncorespplacer int Pplacer requires a lot of memory. 
If you want you can + set less cores for pplacer, which improves memory + consumption significantly + --hmm HMM run hmmer on all these HMMs instead + --training Run EukCC in training mode (needed to create a new + release of the DB) + --proteins Input fasta is proteins + --bed file.bed, -b file.bed + You can pass a bedfile of the protein location to omit + fragmented proteins being detected twice + --force, -f Force rerun of computation even if output is newer + than input. Don't resume previous run. + --keeptemp Keep all temporary files, by default EukCC will remove + some temp files + --fplace, -p Force rerun of placement and subsequent steps + --noglob, -g Do not expand paths using glob + --quiet, -q Silcence most output + --debug, -d Debug and thus ignore safety + --HPA Set placement method to HPA + --nPlacements n Set number of proteins to support location in tree + (default: 2) + --minGenomes n Minimal number of genomes to support a set (default: + 3) + --fullineage Output full lineage for MAGs + --minPlacementLikelyhood float + minimal pplacer likelyhood (default: 0.4) + --mindist n Distance to collapse hits (default: 2000) + --touch Do not run, but touch all output files + --gmes only run GeneMark-ES + --pygmes Use pygmes, will improve eukccs capability of running + on highly fragmented bins but will take longer + --diamond DIAMOND required to use pygmes option + --plot produce plots + -v, --version show program's version number and exit diff --git a/docs/index.rst b/docs/index.rst index d0accd5..04ad356 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,6 +13,7 @@ EukCC Install Tutorial + EukCC options FAQ diff --git a/docs/usage/options.rst b/docs/usage/options.rst new file mode 100644 index 0000000..7e4c54f --- /dev/null +++ b/docs/usage/options.rst @@ -0,0 +1,77 @@ +EukCC options explained +============================ + +EukCC in genome (default) mode +-------------------------------- +When launching EukCC without special +parameters, it will 
assume that a +genomic fasta file was passed as input. +Thus GeneMark-ES will be used +to predict proteins. + +.. code-block:: shell + + eukcc --db eukccdb -o . genome.fna + + +EukCC in protein mode +--------------------------- +If proteins for a genome or MAG were already predicted +using another pipeline, EukCC can be used to +estimate the completeness and contamination. + +For this, EukCC requires at least the proteins as a FASTA +file. Internally EukCC ignores repeated proteins that +occur very close to each other on a genomic level. This +is due to common gene prediction errors and subsequent +duplicated hits with profile HMMs. Thus it is possible +to pass the genomic coordinates for each protein as a bed +file to EukCC. Such a file can easily be prepared from +gtf or gff files. If no bed file is provided, this step +will be skipped. + + +.. code-block:: shell + + eukcc --db eukccdb -o . \ + --proteins genome.faa \ + --bed coordinates.bed + + + +EukCC using pygmes +--------------------------- +GeneMark-ES uses a self-training step to generate a model +for the provided genome. Sometimes highly fragmented or +incomplete genomes can fail to provide enough training data +for GeneMark-ES model creation to succeed. + +In these cases EukCC can rely on pygmes to select a suitable +model from previously analysed genomes. This makes it possible to estimate +genome completeness even for very incomplete genomes. + +For pygmes to select a suitable model the user needs to provide +a diamond database with taxonomic information. +Such a database can be downloaded here: + +.. code-block:: shell + + wget -O uniref50_pygmes.dmnd http://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/uniref50_20200213_tax.dmnd + +EukCC can then be launched with: + + +.. code-block:: shell + + eukcc --db eukccdb -o . \ + --pygmes \ + --diamond uniref50_pygmes.dmnd \ + genome.fna + + +EukCC -h +------------------------ + + +.. include:: ../_static/eukcc-help.txt +