# Source code for bioturing_connector.bbrowserx_connector

"""Python package for submitting/getting data from BBrowserX"""

from pathlib import Path

from typing import List
from typing import Union

from .common import common
from .common import get_uuid

from .typing import Species
from .typing import StudyType
from .typing import ChunkSize
from .typing import InputMatrixType

from .connector import Connector


class BBrowserXConnector(Connector):
    """
    Connector object used to submit data to / get data from BBrowserX.

    Parameters
    ----------
    host : str
        The URL of the BBrowserX server. Only HTTPS connections are supported.
        Example: https://talk2data.bioturing.com/t2d_index_tool/
    token : str
        The API token used to verify authority. Generated in-app.
    """

    def _check_study_type(self, study_type):
        """
        Ensure ``study_type`` is one of the formats this connector accepts.

        Parameters
        ----------
        study_type : int
            A ``StudyType.<X>.value`` constant.

        Raises
        ------
        Exception
            If ``study_type`` is not one of the supported values below.
        """
        # A tuple (not a set) keeps the comparison equality-based, so even an
        # unhashable value ends in the descriptive error below.
        supported = (
            StudyType.BBROWSER.value,
            StudyType.H5_10X.value,
            StudyType.H5AD.value,
            StudyType.MTX_10X.value,
            StudyType.BCS.value,
            StudyType.RDS.value,
            StudyType.TSV.value,
            StudyType.TILE_DB.value,
        )
        if study_type not in supported:
            raise Exception('Unsupported study_type, please recheck the imported connector \n(should be BBrowserXConnector for this study_type)')

    def _submit_study(
        self,
        group_id: str,
        study_id: str = None,
        name: str = 'TBD',
        authors: List[str] = [],
        abstract: str = '',
        species: str = Species.HUMAN.value,
        input_matrix_type: str = InputMatrixType.NORMALIZED.value,
        study_type: int = StudyType.H5AD.value,
        min_counts: int = None,
        min_genes: int = None,
        max_counts: int = None,
        max_genes: int = None,
        mt_percentage: Union[int, float] = None,
        skip_dimred: bool = False,
    ):
        """
        Build the request payload shared by every submission route.

        Filter arguments left as ``None`` fall back to permissive defaults
        (0 for the minimums, 1e9 for the maximums, 100 for
        ``mt_percentage``). ``mt_percentage`` is given on a 0-100 scale and
        is divided by 100 before being placed in the payload.

        Returns
        -------
        dict
            Payload with the keys ``species``, ``group_id``,
            ``filter_params``, ``study_type``, ``normalize``, ``subsample``,
            ``skip_dimred`` and ``study_info``. Source-specific keys (batch
            information, bucket id, upload path) are added by the callers.
        """
        # NOTE: the ``authors=[]`` default is never mutated here, so sharing
        # it between calls is harmless.
        if study_id is None:
            study_id = get_uuid()
        if min_counts is None:
            min_counts = 0
        if min_genes is None:
            min_genes = 0
        if max_counts is None:
            max_counts = 1e9
        if max_genes is None:
            max_genes = 1e9
        if mt_percentage is None:
            mt_percentage = 100

        study_info = {
            'study_hash_id': study_id,
            'name': name,
            'authors': authors if authors else [],
            'abstract': abstract
        }
        return {
            'species': species,
            'group_id': group_id,
            'filter_params': {
                'min_counts': min_counts,
                'min_genes': min_genes,
                'max_counts': max_counts,
                'max_genes': max_genes,
                'mt_percentage': mt_percentage / 100
            },
            'study_type': study_type,
            # Normalization is requested only when the input matrix is raw.
            'normalize': input_matrix_type == InputMatrixType.RAW.value,
            'subsample': -1,
            'skip_dimred': skip_dimred,
            'study_info': study_info,
        }

    @staticmethod
    def _name_s3_batches(study_type, batch_info):
        """
        Return copies of the batch dicts, each with a derived ``'name'``.

        The name is the parent folder of the matrix file for MTX batches
        (``Batch <n>`` when there is no usable parent), the last component
        of ``'folder'`` for TileDB batches, and the matrix file name for
        every other format. The caller's dicts are left untouched.
        """
        is_mtx = study_type == StudyType.MTX_10X.value
        is_tile_db = study_type == StudyType.TILE_DB.value

        named_batches = []
        for index, batch in enumerate(batch_info):
            if is_mtx:
                parts = batch['matrix'].split('/')
                parent = parts[-2] if len(parts) > 1 else ''
                # An empty parent (e.g. '/matrix.mtx') would give an empty
                # name, so it gets the same fallback as "no parent at all".
                batch_name = parent or f'Batch {index + 1}'
            elif is_tile_db:
                # Tolerate a trailing slash: 'path/GSE1/' must name the batch
                # 'GSE1', not ''.
                batch_name = batch['folder'].rstrip('/').split('/')[-1]
            else:
                batch_name = batch['matrix'].split('/')[-1]
            named_batches.append({**batch, 'name': batch_name})
        return named_batches

    def _post_and_track_submission(self, group_id, api_route, data):
        """
        Post a submission payload and fetch the log of the resulting task.

        Returns ``False`` when no task id can be parsed from the server
        response; otherwise returns the result of ``get_submission_log``.
        """
        submission_status = self.post_request(
            api_route=api_route,
            data=data
        )
        task_id = common.parse_submission_status(submission_status)
        if task_id is None:
            return False
        return self.get_submission_log(
            group_id=group_id,
            task_id=task_id,
        )

    def submit_study_from_s3(
        self,
        group_id: str,
        s3_id: str = None,
        batch_info: List[dict] = [],
        study_id: str = None,
        name: str = 'TBD',
        authors: List[str] = [],
        abstract: str = '',
        species: str = Species.HUMAN.value,
        input_matrix_type: str = InputMatrixType.NORMALIZED.value,
        study_type: int = StudyType.H5AD.value,
        min_counts: int = None,
        min_genes: int = None,
        max_counts: int = None,
        max_genes: int = None,
        mt_percentage: Union[int, float] = None,
        skip_dimred: bool = False
    ):
        """
        Submit one or multiple datasets from an s3 bucket to BBrowserX.

        Parameters
        ----------
        group_id : str
            ID of the group to submit the data to.
        s3_id : str, optional
            ID of the s3 bucket. Default: None.
            If s3_id is not provided, the first s3 bucket configured on the
            platform is used.
        batch_info : List[dict]
            File path information of each batch. The paths must NOT include
            the bucket path. A ``'name'`` entry is derived for every batch
            from its path; the dicts passed in are not modified.
            Example:
                For h5ad format:
                    [{'matrix': 's3_path/GSE128223_1.h5ad'}, {...}]
                For mtx format:
                    [{
                        'matrix': 's3_path/data_1/matrix.mtx',
                        'features': 's3_path/data_1/features.tsv',
                        'barcodes': 's3_path/data_1/barcodes.tsv',
                    }, {...}]
                For tiledb format:
                    [{'folder': 's3_path/GSE128223_1'}, {...}]
        study_id : str, optional
            Will be the displaying name of the study (eg: PBMC_3K).
            Default: a newly generated uuid.
        name : str, optional
            Name of the study. Default: 'TBD'
        authors : List[str], optional
            Authors of the study. Default: []
        abstract : str, optional
            Abstract of the study. Default: ''
        species : bioturing_connector.typing.Species, optional
            Species of the study. Default: Species.HUMAN.value
            Support:
                bioturing_connector.typing.Species.HUMAN.value
                bioturing_connector.typing.Species.MOUSE.value
                bioturing_connector.typing.Species.NON_HUMAN_PRIMATE.value
                bioturing_connector.typing.Species.OTHERS.value
        input_matrix_type : bioturing_connector.typing.InputMatrixType, optional
            Is the input matrix already normalized or not?
            Default: InputMatrixType.NORMALIZED.value
            Support:
                bioturing_connector.typing.InputMatrixType.NORMALIZED.value
                    (will skip BioTuring normalization, h5ad: use adata.X)
                bioturing_connector.typing.InputMatrixType.RAW.value
                    (apply BioTuring normalization, h5ad: use adata.raw.X)
        study_type : bioturing_connector.typing.StudyType, optional
            Format of the study. Default: StudyType.H5AD.value
            Support:
                bioturing_connector.typing.StudyType.BBROWSER.value
                bioturing_connector.typing.StudyType.H5_10X.value
                bioturing_connector.typing.StudyType.H5AD.value
                bioturing_connector.typing.StudyType.MTX_10X.value
                bioturing_connector.typing.StudyType.BCS.value
                bioturing_connector.typing.StudyType.RDS.value
                bioturing_connector.typing.StudyType.TSV.value
                bioturing_connector.typing.StudyType.TILE_DB.value
        min_counts : int, optional
            Minimum number of counts required for a cell to pass filtering.
            Default: 0
        min_genes : int, optional
            Minimum number of genes expressed required for a cell to pass
            filtering. Default: 0
        max_counts : int, optional
            Maximum number of counts required for a cell to pass filtering.
            Default: 1e9 (effectively unbounded)
        max_genes : int, optional
            Maximum number of genes expressed required for a cell to pass
            filtering. Default: 1e9 (effectively unbounded)
        mt_percentage : int or float, optional
            Maximum percentage of mitochondria genes required for a cell to
            pass filtering, ranging from 0 to 100. Default: 100
        skip_dimred : bool, optional
            Skip BioTuring pipeline if set to True (only applicable when the
            input is a scanpy/seurat object). Default: False

        Returns
        -------
        Submission status : bool | str
            ``False`` when no task id could be parsed from the server
            response; otherwise the submission log of the task (originally
            described as "True or Error log").

        Raises
        ------
        Exception
            If ``study_type`` is not supported by this connector.
        """
        self._check_study_type(study_type)
        data = self._submit_study(
            group_id=group_id,
            study_id=study_id,
            name=name,
            authors=authors,
            abstract=abstract,
            species=species,
            input_matrix_type=input_matrix_type,
            study_type=study_type,
            min_counts=min_counts,
            min_genes=min_genes,
            max_counts=max_counts,
            max_genes=max_genes,
            mt_percentage=mt_percentage,
            skip_dimred=skip_dimred,
        )

        named_batches = self._name_s3_batches(study_type, batch_info)
        data['batch_info'] = {
            f'Batch_{index}': batch
            for index, batch in enumerate(named_batches)
        }
        data['s3_id'] = s3_id

        return self._post_and_track_submission(
            group_id=group_id,
            api_route='api/v1/submit_study_from_s3',
            data=data,
        )

    def submit_study_from_shared_s3(
        self,
        group_id: str,
        shared_s3_id: str,
        batch_info: List[dict] = [],
        study_id: str = None,
        name: str = 'TBD',
        authors: List[str] = [],
        abstract: str = '',
        species: str = Species.HUMAN.value,
        input_matrix_type: str = InputMatrixType.NORMALIZED.value,
        study_type: int = StudyType.H5AD.value,
        min_counts: int = None,
        min_genes: int = None,
        max_counts: int = None,
        max_genes: int = None,
        mt_percentage: Union[int, float] = None,
        skip_dimred: bool = False
    ):
        """
        Submit one or multiple datasets from a shared s3 bucket to BBrowserX.

        Parameters
        ----------
        group_id : str
            ID of the group to submit the data to.
        shared_s3_id : str
            ID of the shared s3 bucket.
        batch_info : List[dict]
            File path information of each batch. The paths must NOT include
            the bucket path. A ``'name'`` entry is derived for every batch
            from its path; the dicts passed in are not modified.
            Example:
                For h5ad format:
                    [{'matrix': 's3_path/GSE128223_1.h5ad'}, {...}]
                For mtx format:
                    [{
                        'matrix': 's3_path/data_1/matrix.mtx',
                        'features': 's3_path/data_1/features.tsv',
                        'barcodes': 's3_path/data_1/barcodes.tsv',
                    }, {...}]
                For tiledb format:
                    [{'folder': 's3_path/GSE128223_1'}, {...}]
        study_id : str, optional
            Will be the displaying name of the study (eg: PBMC_3K).
            Default: a newly generated uuid.
        name : str, optional
            Name of the study. Default: 'TBD'
        authors : List[str], optional
            Authors of the study. Default: []
        abstract : str, optional
            Abstract of the study. Default: ''
        species : bioturing_connector.typing.Species, optional
            Species of the study. Default: Species.HUMAN.value
            Support:
                bioturing_connector.typing.Species.HUMAN.value
                bioturing_connector.typing.Species.MOUSE.value
                bioturing_connector.typing.Species.NON_HUMAN_PRIMATE.value
                bioturing_connector.typing.Species.OTHERS.value
        input_matrix_type : bioturing_connector.typing.InputMatrixType, optional
            Is the input matrix already normalized or not?
            Default: InputMatrixType.NORMALIZED.value
            Support:
                bioturing_connector.typing.InputMatrixType.NORMALIZED.value
                    (will skip BioTuring normalization, h5ad: use adata.X)
                bioturing_connector.typing.InputMatrixType.RAW.value
                    (apply BioTuring normalization, h5ad: use adata.raw.X)
        study_type : bioturing_connector.typing.StudyType, optional
            Format of the study. Default: StudyType.H5AD.value
            Support:
                bioturing_connector.typing.StudyType.BBROWSER.value
                bioturing_connector.typing.StudyType.H5_10X.value
                bioturing_connector.typing.StudyType.H5AD.value
                bioturing_connector.typing.StudyType.MTX_10X.value
                bioturing_connector.typing.StudyType.BCS.value
                bioturing_connector.typing.StudyType.RDS.value
                bioturing_connector.typing.StudyType.TSV.value
                bioturing_connector.typing.StudyType.TILE_DB.value
        min_counts : int, optional
            Minimum number of counts required for a cell to pass filtering.
            Default: 0
        min_genes : int, optional
            Minimum number of genes expressed required for a cell to pass
            filtering. Default: 0
        max_counts : int, optional
            Maximum number of counts required for a cell to pass filtering.
            Default: 1e9 (effectively unbounded)
        max_genes : int, optional
            Maximum number of genes expressed required for a cell to pass
            filtering. Default: 1e9 (effectively unbounded)
        mt_percentage : int or float, optional
            Maximum percentage of mitochondria genes required for a cell to
            pass filtering, ranging from 0 to 100. Default: 100
        skip_dimred : bool, optional
            Skip BioTuring pipeline if set to True (only applicable when the
            input is a scanpy/seurat object). Default: False

        Returns
        -------
        Submission status : bool | str
            ``False`` when no task id could be parsed from the server
            response; otherwise the submission log of the task (originally
            described as "True or Error log").

        Raises
        ------
        Exception
            If ``study_type`` is not supported by this connector.
        """
        self._check_study_type(study_type)
        data = self._submit_study(
            group_id=group_id,
            study_id=study_id,
            name=name,
            authors=authors,
            abstract=abstract,
            species=species,
            input_matrix_type=input_matrix_type,
            study_type=study_type,
            min_counts=min_counts,
            min_genes=min_genes,
            max_counts=max_counts,
            max_genes=max_genes,
            mt_percentage=mt_percentage,
            skip_dimred=skip_dimred,
        )

        named_batches = self._name_s3_batches(study_type, batch_info)
        data['batch_info'] = {
            f'Batch_{index}': batch
            for index, batch in enumerate(named_batches)
        }
        data['shared_s3_id'] = shared_s3_id

        return self._post_and_track_submission(
            group_id=group_id,
            api_route='api/v1/submit_study_from_shared_s3',
            data=data,
        )

    def submit_study_from_local(
        self,
        group_id: str,
        batch_info: object,
        study_id: str = None,
        name: str = 'TBD',
        authors: List[str] = [],
        abstract: str = '',
        species: str = Species.HUMAN.value,
        input_matrix_type: str = InputMatrixType.NORMALIZED.value,
        study_type: int = StudyType.H5AD.value,
        min_counts: int = None,
        min_genes: int = None,
        max_counts: int = None,
        max_genes: int = None,
        mt_percentage: Union[int, float] = None,
        skip_dimred: bool = False,
        chunk_size: int = ChunkSize.CHUNK_100_MB.value
    ):
        """
        Submit one or multiple datasets from local / server.

        The files are uploaded in chunks first, then the study is submitted
        with the resulting upload location.

        Parameters
        ----------
        group_id : str
            ID of the group to submit the data to.
        batch_info : List[dict]
            File path and batch name information. For mtx format every batch
            must provide a ``'name'``; for the other formats the batch is
            named after its matrix file. The dicts passed in are not
            modified.
            Example:
                For h5ad format:
                    [{'matrix': 'local_path/GSE128223_1.h5ad'}, {...}]
                For mtx format:
                    [{
                        'name': 'data_1',
                        'matrix': 'local_path/data_1/matrix.mtx',
                        'features': 'local_path/data_1/features.tsv',
                        'barcodes': 'local_path/data_1/barcodes.tsv',
                    }, {...}]
        study_id : str, optional
            Will be the displaying name of the study (eg: PBMC_3K).
            Default: a newly generated uuid.
        name : str, optional
            Name of the study. Default: 'TBD'
        authors : List[str], optional
            Authors of the study. Default: []
        abstract : str, optional
            Abstract of the study. Default: ''
        species : bioturing_connector.typing.Species, optional
            Species of the study. Default: Species.HUMAN.value
            Support:
                bioturing_connector.typing.Species.HUMAN.value
                bioturing_connector.typing.Species.MOUSE.value
                bioturing_connector.typing.Species.NON_HUMAN_PRIMATE.value
                bioturing_connector.typing.Species.OTHERS.value
        input_matrix_type : bioturing_connector.typing.InputMatrixType, optional
            Is the input matrix already normalized or not?
            Default: InputMatrixType.NORMALIZED.value
            Support:
                bioturing_connector.typing.InputMatrixType.NORMALIZED.value
                    (will skip BioTuring normalization, h5ad: use adata.X)
                bioturing_connector.typing.InputMatrixType.RAW.value
                    (apply BioTuring normalization, h5ad: use adata.raw.X)
        study_type : bioturing_connector.typing.StudyType, optional
            Format of the study. Default: StudyType.H5AD.value
            Support:
                bioturing_connector.typing.StudyType.BBROWSER.value
                bioturing_connector.typing.StudyType.H5_10X.value
                bioturing_connector.typing.StudyType.H5AD.value
                bioturing_connector.typing.StudyType.MTX_10X.value
                bioturing_connector.typing.StudyType.BCS.value
                bioturing_connector.typing.StudyType.RDS.value
                bioturing_connector.typing.StudyType.TSV.value
            ``StudyType.TILE_DB.value`` is rejected here: it can only be
            submitted through s3 or shared s3.
        min_counts : int, optional
            Minimum number of counts required for a cell to pass filtering.
            Default: 0
        min_genes : int, optional
            Minimum number of genes expressed required for a cell to pass
            filtering. Default: 0
        max_counts : int, optional
            Maximum number of counts required for a cell to pass filtering.
            Default: 1e9 (effectively unbounded)
        max_genes : int, optional
            Maximum number of genes expressed required for a cell to pass
            filtering. Default: 1e9 (effectively unbounded)
        mt_percentage : int or float, optional
            Maximum percentage of mitochondria genes required for a cell to
            pass filtering, ranging from 0 to 100. Default: 100
        skip_dimred : bool, optional
            Skip BioTuring pipeline if set to True (only applicable when the
            input is a scanpy/seurat object). Default: False
        chunk_size : bioturing_connector.typing.ChunkSize, optional
            Size of each separated chunk for uploading.
            Default: ChunkSize.CHUNK_100_MB.value
            Support:
                bioturing_connector.typing.ChunkSize.CHUNK_5_MB.value
                bioturing_connector.typing.ChunkSize.CHUNK_100_MB.value
                bioturing_connector.typing.ChunkSize.CHUNK_500_MB.value
                bioturing_connector.typing.ChunkSize.CHUNK_1_GB.value

        Returns
        -------
        Submission status : bool | str
            An explanatory message when ``study_type`` is TileDB or
            ``chunk_size`` is not supported (nothing is uploaded in that
            case); ``False`` when no task id could be parsed from the server
            response; otherwise the submission log of the task (originally
            described as "True or Error log").

        Raises
        ------
        Exception
            If ``study_type`` is not supported by this connector.
        """
        if study_type == StudyType.TILE_DB.value:
            return 'Tile_db submission is only supported through s3 or shared s3'

        self._check_study_type(study_type)

        if chunk_size not in [e.value for e in ChunkSize]:
            return 'only support:\n{},\n{},\n{},\n{}'.format(
                'ChunkSize.CHUNK_5_MB.value',
                'ChunkSize.CHUNK_100_MB.value',
                'ChunkSize.CHUNK_500_MB.value',
                'ChunkSize.CHUNK_1_GB.value',
            )

        # Build the payload before uploading so that an invalid argument
        # fails before any (potentially large) file transfer starts.
        data = self._submit_study(
            group_id=group_id,
            study_id=study_id,
            name=name,
            authors=authors,
            abstract=abstract,
            species=species,
            input_matrix_type=input_matrix_type,
            study_type=study_type,
            min_counts=min_counts,
            min_genes=min_genes,
            max_counts=max_counts,
            max_genes=max_genes,
            mt_percentage=mt_percentage,
            skip_dimred=skip_dimred,
        )

        file_names = []
        files = []
        batch_names = []
        if study_type == StudyType.MTX_10X.value:
            # NOTE(review): upload names are '<name>matrix.mtx' etc. with no
            # separator, and barcodes are sent as '.csv' even for a '.tsv'
            # source -- presumably what the server expects; confirm.
            mtx_parts = (
                ('matrix', 'matrix.mtx'),
                ('features', 'features.tsv'),
                ('barcodes', 'barcodes.csv'),
            )
            for batch in batch_info:
                batch_name = batch['name']
                for key, upload_name in mtx_parts:
                    source = batch[key]
                    # Only a real '.gz' extension marks a compressed file; a
                    # '.gz' elsewhere in the path must not.
                    gz_suffix = '.gz' if str(source).endswith('.gz') else ''
                    file_names.append(f'{batch_name}{upload_name}{gz_suffix}')
                    files.append(Path(source))
                batch_names.append(batch_name)
        else:
            for batch in batch_info:
                matrix_path = Path(batch['matrix'])
                file_names.append(matrix_path.name)
                files.append(matrix_path)
                batch_names.append(matrix_path.name)

        output_dir = self.upload_chunk(
            file_names,
            files,
            chunk_size
        )

        data['study_path'] = output_dir
        data['batch_info'] = batch_names

        return self._post_and_track_submission(
            group_id=group_id,
            api_route='api/v1/submit_study_from_local',
            data=data,
        )