Source code for bioturing_connector.bbrowserx_connector

"""Python package for submitting/getting data from BBrowserX"""

from pathlib import Path

from typing import List
from typing import Union

from .common import common
from .common import get_uuid

from .typing import Species
from .typing import StudyType
from .typing import ChunkSize
from .typing import InputMatrixType

from .connector import Connector



[docs]
class BBrowserXConnector(Connector):
  """
  Create a connector object to submit/get data from BBrowserX

  Parameters
  ----------
  host : `str`
    The URL of the BBrowserX server, only support HTTPS connection\n
    Example:
      https://talk2data.bioturing.com/t2d_index_tool/
  token : `str`
    The API token to verify authority. Generated in-app.
  """


  def _check_study_type(self, study_type):
    if study_type not in [
      StudyType.BBROWSER.value,
      StudyType.H5_10X.value,
      StudyType.H5AD.value,
      StudyType.MTX_10X.value,
      StudyType.BCS.value,
      StudyType.RDS.value,
      StudyType.TSV.value,
      StudyType.TILE_DB.value,
    ]:
      raise Exception('Unsupported study_type, please recheck the imported connector \n(should be BBrowserXConnector for this study_type)')


  def _submit_study(
    self,
    group_id: str,
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    input_matrix_type: str = InputMatrixType.NORMALIZED.value,
    study_type: int = StudyType.H5AD.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    mt_percentage: Union[int, float] = None,
    skip_dimred: bool = False,
  ):
    if study_id is None:
      study_id = get_uuid()

    if min_counts is None:
      min_counts = 0

    if min_genes is None:
      min_genes = 0

    if max_counts is None:
      max_counts = 1e9

    if max_genes is None:
      max_genes = 1e9

    if mt_percentage is None:
      mt_percentage = 100

    study_info = {
      'study_hash_id': study_id,
      'name': name,
      'authors': authors if authors else [],
      'abstract': abstract
    }

    return {
      'species': species,
      'group_id': group_id,
      'filter_params': {
        'min_counts': min_counts,
        'min_genes': min_genes,
        'max_counts': max_counts,
        'max_genes': max_genes,
        'mt_percentage': mt_percentage / 100
      },
      'study_type': study_type,
      'normalize': input_matrix_type == InputMatrixType.RAW.value,
      'subsample': -1,
      'skip_dimred': skip_dimred,
      'study_info': study_info,
    }



[docs]
  def submit_study_from_s3(
      self,
      group_id: str,
      s3_id: str = None,
      batch_info: List[dict] = [],
      study_id: str = None,
      name: str = 'TBD',
      authors: List[str] = [],
      abstract: str = '',
      species: str = Species.HUMAN.value,
      input_matrix_type: str = InputMatrixType.NORMALIZED.value,
      study_type: int = StudyType.H5AD.value,
      min_counts: int = None,
      min_genes: int = None,
      max_counts: int = None,
      max_genes: int = None,
      mt_percentage: Union[int, float] = None,
      skip_dimred: bool = False
    ):
    """
    Submit one or multiple datasets from s3 bucket to BBrowserX.

    Parameters
    ----------
    group_id : str
          ID of the group to submit the data to.
    s3_id : str, Optional
          ID of s3 bucket. Default: None\n
          If s3_id is not provided, we will use the first s3 bucket configured on the platform.
    batch_info : List[dict]
          File path and batch name information, the path DOES NOT included the bucket path!\n
          Example:
            For h5ad format:
              [{
                'matrix': 's3_path/GSE128223_1.h5ad'
              }, {...}]
            For mtx format:
              [{
                'matrix': 's3_path/data_1/matrix.mtx',\n
                'features': 's3_path/data_1/features.tsv',\n
                'barcodes': 's3_path/data_1/barcodes.tsv',
              }, {...}]
            For tiledb format:
              [{
                'folder': 's3_path/GSE128223_1'
              }, {...}]
    study_id : str, optional
          Will be the displaying name of study (eg: PBMC_3K). Default: uuidv4
    name : str, optional
          Name of the study. Default: 'TBD'
    authors : List[str], optional
          Authors of the study. Default: []
    abstract : str, optional
          Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
          Species of the study. Default: 'human'\n
          Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.NON_HUMAN_PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    skip_dimred : Bool, optional
          Skip BioTuring pipeline if set to True. Default: False\n
          (only applicable when input is a scanpy/seurat object).
    input_matrix_type : bioturing_connector.typing.InputMatrixType, optional
          Is the input matrix already normalized or not?. Default: 'normalized'\n
          Support:
                bioturing_connector.typing.InputMatrixType.NORMALIZED.value
                  (will skip BioTuring normalization, h5ad: use adata.X)
                bioturing_connector.typing.InputMatrixType.RAW.value
                  (apply BioTuring normalization, h5ad: use adata.raw.X)
    study_type : bioturing_connector.typing.StudyType, opitonal
          Format of the study. Default: bioturing_connector.typing.StudyType.H5AD.value\n
          Support:
                bioturing_connector.typing.StudyType.BBROWSER.value\n
                bioturing_connector.typing.StudyType.H5_10X.value\n
                bioturing_connector.typing.StudyType.H5AD.value\n
                bioturing_connector.typing.StudyType.MTX_10X.value\n
                bioturing_connector.typing.StudyType.BCS.value\n
                bioturing_connector.typing.StudyType.RDS.value\n
                bioturing_connector.typing.StudyType.TSV.value\n
                bioturing_connector.typing.StudyType.TILE_DB.value\n
    min_counts : int, optional
          Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
          Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
          Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
          Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    mt_percentage : int, optional
          Maximum number of mitochondria genes percentage required for a cell to pass filtering. Default: 100\n
          Ranging from 0 to 100\n

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    self._check_study_type(study_type)
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      input_matrix_type,
      study_type,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
      mt_percentage,
      skip_dimred,
    )

    if study_type == StudyType.MTX_10X.value:
      for i, o in enumerate(batch_info):
        name = o['matrix'].split('/')
        if len(name) == 1:
          o['name'] = f'Batch {i + 1}'
        else:
          o['name'] = name[-2]
    elif study_type == StudyType.TILE_DB.value:
      for i, o in enumerate(batch_info):
        o['name'] = o['folder'].split('/')[-1]
    else:
      for i, o in enumerate(batch_info):
        o['name'] = o['matrix'].split('/')[-1]

    data['batch_info'] = {f'Batch_{i}': o for i, o in enumerate(batch_info)}
    data['s3_id'] = s3_id

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_s3',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id,
    )




[docs]
  def submit_study_from_shared_s3(
      self,
      group_id: str,
      shared_s3_id: str,
      batch_info: List[dict] = [],
      study_id: str = None,
      name: str = 'TBD',
      authors: List[str] = [],
      abstract: str = '',
      species: str = Species.HUMAN.value,
      input_matrix_type: str = InputMatrixType.NORMALIZED.value,
      study_type: int = StudyType.H5AD.value,
      min_counts: int = None,
      min_genes: int = None,
      max_counts: int = None,
      max_genes: int = None,
      mt_percentage: Union[int, float] = None,
      skip_dimred: bool = False
    ):
    """
    Submit one or multiple datasets from s3 bucket to BBrowserX.

    Parameters
    ----------
    group_id : str
          ID of the group to submit the data to.
    shared_s3_id : str
          ID of s3 bucket.
    batch_info : List[dict]
          File path and batch name information, the path DOES NOT included the bucket path!\n
          Example:
            For h5ad format:
              [{
                'matrix': 's3_path/GSE128223_1.h5ad'
              }, {...}]
            For mtx format:
              [{
                'matrix': 's3_path/data_1/matrix.mtx',\n
                'features': 's3_path/data_1/features.tsv',\n
                'barcodes': 's3_path/data_1/barcodes.tsv',
              }, {...}]
            For tiledb format:
              [{
                'folder': 's3_path/GSE128223_1'
              }, {...}]
    study_id : str, optional
          Will be the displaying name of study (eg: PBMC_3K). Default: uuidv4
    name : str, optional
          Name of the study. Default: 'TBD'
    authors : List[str], optional
          Authors of the study. Default: []
    abstract : str, optional
          Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
          Species of the study. Default: 'human'\n
          Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.NON_HUMAN_PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    skip_dimred : Bool, optional
          Skip BioTuring pipeline if set to True. Default: False\n
          (only applicable when input is a scanpy/seurat object).
    input_matrix_type : bioturing_connector.typing.InputMatrixType, optional
          Is the input matrix already normalized or not?. Default: 'normalized'\n
          Support:
                bioturing_connector.typing.InputMatrixType.NORMALIZED.value
                  (will skip BioTuring normalization, h5ad: use adata.X)
                bioturing_connector.typing.InputMatrixType.RAW.value
                  (apply BioTuring normalization, h5ad: use adata.raw.X)
    study_type : bioturing_connector.typing.StudyType, opitonal
          Format of the study. Default: bioturing_connector.typing.StudyType.H5AD.value\n
          Support:
                bioturing_connector.typing.StudyType.BBROWSER.value\n
                bioturing_connector.typing.StudyType.H5_10X.value\n
                bioturing_connector.typing.StudyType.H5AD.value\n
                bioturing_connector.typing.StudyType.MTX_10X.value\n
                bioturing_connector.typing.StudyType.BCS.value\n
                bioturing_connector.typing.StudyType.RDS.value\n
                bioturing_connector.typing.StudyType.TSV.value\n
                bioturing_connector.typing.StudyType.TILE_DB.value\n
    min_counts : int, optional
          Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
          Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
          Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
          Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    mt_percentage : int, optional
          Maximum number of mitochondria genes percentage required for a cell to pass filtering. Default: 100\n
          Ranging from 0 to 100\n

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    self._check_study_type(study_type)
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      input_matrix_type,
      study_type,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
      mt_percentage,
      skip_dimred,
    )

    if study_type == StudyType.MTX_10X.value:
      for i, o in enumerate(batch_info):
        name = o['matrix'].split('/')
        if len(name) == 1:
          o['name'] = f'Batch {i + 1}'
        else:
          o['name'] = name[-2]
    elif study_type == StudyType.TILE_DB.value:
      for i, o in enumerate(batch_info):
        o['name'] = o['folder'].split('/')[-1]
    else:
      for i, o in enumerate(batch_info):
        o['name'] = o['matrix'].split('/')[-1]

    data['batch_info'] = {f'Batch_{i}': o for i, o in enumerate(batch_info)}
    data['shared_s3_id'] = shared_s3_id

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_shared_s3',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id,
    )




[docs]
  def submit_study_from_local(
    self,
    group_id: str,
    batch_info: object,
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    input_matrix_type: str = InputMatrixType.NORMALIZED.value,
    study_type: int = StudyType.H5AD.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    mt_percentage: Union[int, float] = None,
    skip_dimred: bool = False,
    chunk_size: int = ChunkSize.CHUNK_100_MB.value
  ):
    """
    Submit one or multiple datasets from local / server.

    Parameters
    ----------
    group_id : str
          ID of the group to submit the data to.
    batch_info : List[dict]
          File path and batch name information.\n
          Example:
            For h5ad format:
              [{
                'matrix': 'local_path/GSE128223_1.h5ad'
              }, {...}]
            For mtx format:
              [{
                'name': 'data_1',\n
                'matrix': 'local_path/data_1/matrix.mtx',\n
                'features': 'local_path/data_1/features.tsv',\n
                'barcodes': 'local_path/data_1/barcodes.tsv',
              }, {...}]
    study_id : str, optional
          Will be the displaying name of study (eg: PBMC_3K). Default: uuidv4
    name : str, optional
          Name of the study. Default: 'TBD'
    authors : List[str], optional
          Authors of the study. Default: []
    abstract : str, optional
          Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
          Species of the study. Default: 'human'\n
          Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.NON_HUMAN_PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    input_matrix_type : bioturing_connector.typing.InputMatrixType, optional
          Is the input matrix already normalized or not?. Default: 'normalized'\n
          Support:
              bioturing_connector.typing.InputMatrixType.NORMALIZED.value
                (will skip BioTuring normalization, h5ad: use adata.X)
              bioturing_connector.typing.InputMatrixType.RAW.value
                (apply BioTuring normalization, h5ad: use adata.raw.X)
    study_type : bioturing_connector.typing.StudyType, optional
          Format of the study. Default: bioturing_connector.typing.StudyType.H5AD.value\n
          Support:
                bioturing_connector.typing.StudyType.BBROWSER.value\n
                bioturing_connector.typing.StudyType.H5_10X.value\n
                bioturing_connector.typing.StudyType.H5AD.value\n
                bioturing_connector.typing.StudyType.MTX_10X.value\n
                bioturing_connector.typing.StudyType.BCS.value\n
                bioturing_connector.typing.StudyType.RDS.value\n
                bioturing_connector.typing.StudyType.TSV.value\n
    min_counts : int, optional
          Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
          Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
          Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
          Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    mt_percentage : int, optional
          Maximum number of mitochondria genes percentage required for a cell to pass filtering. Default: 100.\n
          Ranging from 0 to 100
    skip_dimred : bool, optional
          Skip BioTuring pipeline if set to True (only appliable when input is a scanpy/seurat object). Default: False
    chunk_size : bioturing_connector.typing.ChunkSize, optional
          Size of each separated chunk for uploading. Default: 104857600\n
          Support:
                bioturing_connector.typing.ChunkSize.CHUNK_5_MB.value\n
                bioturing_connector.typing.ChunkSize.CHUNK_100_MB.value\n
                bioturing_connector.typing.ChunkSize.CHUNK_500_MB.value\n
                bioturing_connector.typing.ChunkSize.CHUNK_1_GB.value\n

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    if study_type == StudyType.TILE_DB.value:
      return 'Tile_db submission is only supported through s3 or shared s3'

    self._check_study_type(study_type)

    if chunk_size not in [e.value for e in ChunkSize]:
      return 'only support:\n{},\n{},\n{},\n{}'.format(
        'ChunkSize.CHUNK_5_MB.value',
        'ChunkSize.CHUNK_100_MB.value',
        'ChunkSize.CHUNK_500_MB.value',
        'ChunkSize.CHUNK_1_GB.value',
      )

    file_names = []
    files = []
    if study_type == StudyType.MTX_10X.value:
      for o in batch_info:
        file_names.extend([
          f'{o["name"]}matrix.mtx{".gz" if ".gz" in o["matrix"] else ""}',
          f'{o["name"]}features.tsv{".gz" if ".gz" in o["features"] else ""}',
          f'{o["name"]}barcodes.csv{".gz" if ".gz" in o["barcodes"] else ""}'
        ])
        files.extend([
          Path(o['matrix']),
          Path(o['features']),
          Path(o['barcodes'])
        ])
    else:
      for o in batch_info:
        p = Path(o['matrix'])
        o['name'] = p.name
        file_names.append(p.name)
        files.append(p)


    output_dir = self.upload_chunk(
      file_names,
      files,
      chunk_size
    )
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      input_matrix_type,
      study_type,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
      mt_percentage,
      skip_dimred,
    )
    data['study_path'] = output_dir
    data['batch_info'] = [o['name'] for o in batch_info]

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_local',
      data=data,
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id,
    )