from nbdev import nbdev_export
nbdev_export()pygengraph
Install
Enter the directory of the library and enter:
pip install .
and for development use
pip install -e .[dev]
How to use
Use command line interface pantograph
After this package is installed, a command line tool pantograph becomes available.
It has several functions as following
Converting annotations to gene graph (see user manual on what is gene graph).
This is done by a command
$ pantograph annotation2graph [-h] [-g] setting_file.yamlIt requires a path to a yaml file with all settings for the process. If used with -g option, then a sample file will be generated for you to edit and then run it. Sample file has extensive comments explaining every parameter.
So, it is best to do the following:
$ pantograph annotation2graph -g setting.yamlthen, using your favourite text editor, edit the generated file and then run
$ pantograph annotation2graph setting.yamlConverting a file(s) with paths into gene/block graph
This is done by a command
$ pantograph paths2graph [-h] [-g] setting_file.yamlIt requires a path to a yaml file with all settings for the process. If used with -g option, then a sample file will be generated for you to edit and then run it. Sample file has extensive comments explaining every parameter.
So, it is best to do the following:
$ pantograph paths2graph -g setting.yamlthen, using your favourite text editor, edit the generated file and then run
$ pantograph paths2graph setting.yamlSorting a graph
In order to sort a graph, it should be in GFA v1 format file. To run the sorting, you need to use the following command:
$ pantograph sort-graph [-h] [--quiet] [--isseq] [--output OUTPUT] inputwith the following parameters
positional arguments:
input Relative (to current directory) or absolute path to the GFA file with the graph to be sorted.
optional arguments:
-h, --help show this help message and exit
--quiet, -q Suppress most of output. False (i.e. verbose) is not set.
--isseq, -s Does this graph contains nucleotide sequences. False is not set.
--output OUTPUT, -o OUTPUT File path where to save sorted graph. If not set, the input will be overwritten.
Exporting graph into visualisation data structure, which can be used by Pantograph visualisation tool.
This is done by a command
$ pantograph export-vis [-h] [-g] setting_file.yamlIt requires a path to a yaml file with all settings for the process. If used with -g option, then a sample file will be generated for you to edit and then run it. Sample file has extensive comments explaining every parameter.
So, it is best to do the following:
$ pantograph export-vis -g setting.yamlthen, using your favourite text editor, edit the generated file and then run
$ pantograph export-vis setting.yamlUse python package
The rest of the file describes some of the uses of the pyGenGraph package. There are more ways to use it, but more detailed documentation is needed to describe all use cases. Also, more things required for this package to become really universal.
import os
import glob
import re
import time
from pygengraph.graph import GenomeGraph
from pygengraph.utils import pathFileToPathDict
from pygengraph.export import exportProjectIf redis database needs to be cleaned.
from redis import Redis
redisConn = Redis(host=‘redis’,port=6379)
redisConn.flushall()
redisConn.close()
del redisConn
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('always',category=RuntimeWarning)Generating graphs
Generating from annotation
Preparing list of files
refdir = '/path/to/reference/'
annotationdir = '/path/to/annotation'
gfadir = '/path/to/graphs'annotationFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*.gff'))
pangenomeFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*pangen.gff'))
# If you want to include sequences instead of simple notion of genes.
# It should also be converted to sequenceFileDict, see details in documentation for GenomeGraph Class constructor.
# sequenceFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}sequences{os.path.sep}*.fasta'))
refAnnotationFile = f'{refdir}{os.path.sep}reference.gff'
# If you want to include sequences instead of simple notion of genes
# refSequenceFile = f'{refdir}{os.path.sep}reference.fasta'refdir = '../../1001G/annotations/freeze2.1/outgroups'
annotationdir = '../../1001G/annotations/freeze2.1'
gfadir = '../../1001G/annotations/graphs'annotationFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*.gff'))
# pangenomeFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}*pangen.gff'))
# If you want to include sequences instead of simple notion of genes.
# It should also be converted to sequenceFileDict, see details in documentation for GenomeGraph Class constructor.
# sequenceFiles = sorted(glob.glob(f'{annotationdir}{os.path.sep}sequences{os.path.sep}*.fasta'))
refAnnotationFile = f'{refdir}{os.path.sep}araport.gff'
# If you want to include sequences instead of simple notion of genes
# refSequenceFile = f'{refdir}{os.path.sep}reference.fasta'Generaton of gene graph
doUS = False
n = 1
for chrnum in range(1,n+1): # here n is number of chromosomes.
chromosome = f'Chr{chrnum}'
print(f'\nProcessing {chromosome}\n============')
curtst = time.time()
graph = GenomeGraph(annotationFiles = annotationFiles,
pangenomeFiles = None,
sequenceFilesDict = None,
doUS = doUS,
chromosome = chromosome,
refAnnotationFile=refAnnotationFile,
refAccession='TAIR10')
print(f'Generating graph for {chromosome} took {time.time() - curtst} seconds')
curtst = time.time()
graph.treeSort()
print(f'Sorting graph for {chromosome} took {time.time() - curtst} seconds')
if len(graph.nodes)!=len(graph.order):
print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
gfaFilename = f'Gene_{chromosome}_simOnly_unordered.gfa'
graph.order = list(range(1,len(graph.nodes)+1))
else:
gfaFilename = f'Gene_{chromosome}_simOnly.gfa'
graph.toGFA(f'{gfadir}{os.path.sep}{gfaFilename}',doSeq=False)Loading Pathfile to graph
# For path file v1
pathfileDir = 'examples/gene_graph'
pathsfile = 'paths_genegraph.txt'
paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}', True, True, False)
graph = GenomeGraph(pathsDict=paths)
graph.treeSort()
if len(graph.nodes)!=len(graph.order):
print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
output = 'paths_genegraph_unordered.gfa'
graph.order = list(range(1,len(graph.nodes)+1))
graph.toGFA(output,doSeq=False)
else:
coreGFApath = f'{pathfileDir}{os.path.sep}paths_genegraph.gfa'
graph.toGFA(coreGFApath,doSeq=False)# For v2
# This is example, no v2 file currently available for demonstration.
pathfileDir = '/path/to/file'
pathsfile = f'paths.txt'
paths = pathFileToPathDict(f'{pathfileDir}{os.path.sep}{pathsfile}',True,'reference',True)
for seqNum in paths.keys():
graph = GenomeGraph(pathsDict=paths[seqNum])
# On undirected coregraph sorting is not optimal! Check sorting!!!
graph.treeSort()
if len(graph.nodes)!=len(graph.order):
print('Sorting failed and not all nodes were sorted. Saving unsorted graph')
output = f'{pathfileDir}{os.path.sep}graph_Chr{seqNum}_unordered.gfa'
graph.order = list(range(1,len(graph.nodes)+1))
graph.toGFA(output,doSeq=False)
else:
coreGFApath = f'{pathfileDir}{os.path.sep}graph_Chr{seqNum}.gfa'
graph.toGFA(coreGFApath,doSeq=False)Loading graph from GFA and sorting it
gfadir = 'examples/nucleotide_graph'
# It is nucleotide graph. If it is not nucleotide graph, then `isSeq` variable should be changed to False.
gfafilename = 'paths_presentation.gfa'
isSeq = True
graph = GenomeGraph(gfaPath=f'{gfadir}{os.path.sep}{gfafilename}',isGFASeq=isSeq)
graph.treeSort()
assert len(graph.nodes)==len(graph.order)
basename,ext = os.path.splitext(gfafilename)
graph.toGFA(f'{gfadir}{os.path.sep}{basename}_ordered{ext}',doSeq=isSeq)Exporting to Pantograph visualisation
projectID = 'paths_genegraph'
projectName = 'Example gene graph'
pathToGraphs = 'examples/gene_graph'
caseDict = {'Main': 'paths_genegraph.gfa'}
pathToIndex = 'examples/Visdata'
# This is if you run it in Docker compose together with active Redis image, which is named "redis".
# If you have separate redis server, enter full address here.
# If you do not want to add any annotation, `redisHost` should be None.
redisHost = 'redis'
redisPort = 6379
redisDB = 0
suffix = ''
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = False
zoomLevels = [1,2,4]
fillZoomLevel = True
exportProject(projectID, projectName, caseDict, pathToIndex, pathToGraphs,
redisHost = redisHost, redisPort = redisPort, redisDB = redisDB,
suffix = suffix,
maxLengthComponent = maxLengthComponent, maxLengthChunk = maxLengthChunk,
inversionThreshold = inversionThreshold,
isSeq = isSeq,
zoomLevels = zoomLevels, fillZoomLevel = fillZoomLevel)projectID = 'tutorial_graph'
projectName = 'Example nucleotide graph'
pathToGraphs = 'examples/nucleotide_graph'
caseDict = {'Main': 'paths_presentation_ordered.gfa'}
pathToIndex = 'examples/Visdata'
# This is if you run it in Docker compose together with active Redis image, which is named "redis".
# If you have separate redis server, enter full address here.
# If you do not want to add any annotation, `redisHost` should be None.
redisHost = 'redis'
redisPort = 6379
redisDB = 0
suffix = ''
maxLengthComponent = 100
maxLengthChunk = 6
inversionThreshold = 0.5
isSeq = True
zoomLevels = [1,2,4]
fillZoomLevel = True
exportProject(projectID, projectName, caseDict, pathToIndex, pathToGraphs,
redisHost = redisHost, redisPort = redisPort, redisDB = redisDB,
suffix = suffix,
maxLengthComponent = maxLengthComponent, maxLengthChunk = maxLengthChunk,
inversionThreshold = inversionThreshold,
isSeq = isSeq,
zoomLevels = zoomLevels, fillZoomLevel = fillZoomLevel)