Source code for bioturing_connector.lens_sc_connector

"""Python package for submitting/getting data from Lens SC"""

import os
import numpy as np

from typing import List
from typing import Union
from pathlib import Path

from .common import get_uuid
from .common import common

from .typing import Species
from .typing import StudyType
from .typing import ChunkSize
from .typing import TechnologyType
from .typing import INPUT_LENS_SC

from .connector import Connector



[docs]
class LensSCConnector(Connector):
  """
  Create a connector object to submit/get data from BioTuring Lens Single-cell (Xenium/Cosmx/Vizgen/Proteomics)

  Parameters
  ----------
  host : str
    The URL of the LENS SC server, only support HTTPS connection\n
    Example:
      https://talk2data.bioturing.com/lens_sc/
  token : str
    The API token to verify authority. Generated in-app.
  """

  def _check_lens_sc_study_type(self, study_type):
    if study_type not in [
      StudyType.VIZGEN.value,
      StudyType.COSMX.value,
      StudyType.XENIUM.value,
    ]:
      raise Exception('Unsupported study_type, please recheck the imported connector \n(should be LensSCConnector for this study_type)')


  def _check_vizgen_version_2(self, study_type, all_input_files):
    if study_type != StudyType.VIZGEN.value:
      return study_type
    for i in all_input_files:
      if i.lower().endswith('.parquet'):
        return StudyType.VIZGEN_V2.value
    return StudyType.VIZGEN.value


  def _check_valid_input_lens_sc(
      self,
      server_files_path,
      server_folders_path,
      study_type,
    ):
    input_files = INPUT_LENS_SC[study_type]['files'].values()
    input_folders = INPUT_LENS_SC[study_type]['folders'].values()
    file_paths = []
    folder_paths = []
    for f in input_files:
      try:
        file_path = [x for x in server_files_path if x.lower().endswith(f)][0]
        file_paths.append(file_path.split('/')[-1])
      except Exception as e:
        raise ValueError('Cannot find ***{} in selected folder. Error: {}'.format(f, e))
    for f in input_folders:
      try:
        folder_path = [x for x in server_folders_path if x.lower().endswith(f)][0]
        folder_paths.append(folder_path.split('/')[-1])
      except Exception as e:
        raise ValueError('Cannot find {} in selected folder. Error: {}'.format(f, e))
    return file_paths, folder_paths


  def _get_required_files_fols_lens_sc(self, server_dir_path, study_type):
    all_files_fols = [
      os.path.join(server_dir_path, x)
      for x in os.listdir(server_dir_path)
    ]
    all_files = [
      x for x in all_files_fols if os.path.isfile(x)
    ]
    all_folders = [
      x for x in all_files_fols if os.path.isdir(x)
    ]
    study_type = self._check_vizgen_version_2(study_type, all_files_fols)
    return self._check_valid_input_lens_sc(all_files, all_folders, study_type)


  def _upload_fol_lens_sc(self, batch_info, study_type, chunk_size):
    final_file_names = []
    final_files = []
    try:
      for batch in batch_info:
        files_path, fols_path = self._get_required_files_fols_lens_sc(
          batch['folder'],
          study_type
        )
        zip_path = os.path.join(
          batch['folder'], '{}.zip'.format(batch['name'])
        )

        print ('Zipping neccesary files of batch [{}]. \nLocation: {}'.format(batch['name'], zip_path))
        final_file_names.append('{}.zip'.format(batch['name']))
        final_files.append(zip_path)
        os.system('cd {} && zip -r {} {} && cd -'.format(
          batch['folder'],
          zip_path,
          ' '.join(files_path + fols_path)
        ))

      print ('Uploading all files to server...')
      output_dir = self.upload_chunk(
        final_file_names,
        [Path(zip_path)],
        chunk_size
      )

    except Exception as e:
      raise e

    finally:
      for zip_path in final_files:
        print ('Delete zip files: [{}]'.format(zip_path))
        os.system('rm {}'.format(zip_path))

    return output_dir


  def _submit_study(
    self,
    group_id: str,
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    study_type: int = StudyType.XENIUM.value,
    technology_type: str = TechnologyType.LENS_SC.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    neg_controls_percentage: Union[int, float] = None,
  ):
    if study_id is None:
      study_id = get_uuid()

    if min_counts is None:
      min_counts = 0

    if min_genes is None:
      min_genes = 0

    if max_counts is None:
      max_counts = 1e9

    if max_genes is None:
      max_genes = 1e9

    if neg_controls_percentage is None:
      neg_controls_percentage = 100

    study_info = {
      'study_hash_id': study_id,
      'name': name,
      'authors': authors if authors else [],
      'abstract': abstract
    }

    return {
      'species': species,
      'group_id': group_id,
      'filter_params': {
        'min_counts': min_counts,
        'min_genes': min_genes,
        'max_counts': max_counts,
        'max_genes': max_genes,
        'neg_controls_percentage': neg_controls_percentage,
      },
      'study_type': technology_type,
      'platform': study_type,
      'study_info': study_info,
    }



[docs]
  def submit_study_from_s3_lens_sc(
    self,
    group_id: str,
    s3_id: str = None,
    batch_info: List[dict] = [],
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    study_type: int = StudyType.XENIUM.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    neg_controls_percentage: Union[int, float] = None,
  ):
    """
    Submit multiple single cell - spatial folders.

    Parameters
    ----------
    group_id : str
        ID of the group to submit the data to.
    s3_id : str, Optional
        ID of s3 bucket. Default: None\n
        If s3_id is not provided, we will use the first s3 bucket configured on the platform.
    batch_info : List[dict]
        File path and batch name information, the path DOES NOT include the bucket path configured on platform!\n
        Example:
          [{
            'name': 'study_1',\n
            'folder': 's3_path/study_folder',
          }, {...}]
    study_id : str, optional
        Will be the displaying name of study (eg: COSMX_BRAIN). Default: uuidv4\n
    name : str, optional
        Name of the study. Default: 'TBD'
    authors : List[str], optional
        Authors of the study. Default: []
    abstract : str, optional
        Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
        Species of the study. Default: 'human'.\n
        Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    study_type : bioturing_connector.typing.StudyType, optional
        Format of the study. Default: bioturing_connector.typing.StudyType.XENIUM.value.\n
        Support:
                bioturing_connector.typing.StudyType.VIZGEN.value\n
                bioturing_connector.typing.StudyType.COSMX.value\n
                bioturing_connector.typing.StudyType.XENIUM.value\n
    min_counts : int, optional
        Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
        Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
        Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
        Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    neg_controls_percentage : int, optional
        Maximum number of control/negative genes percentage required for a cell to pass filtering. Default: 100\n
        Ranging from 0 to 100

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    self._check_lens_sc_study_type(study_type)
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      study_type,
      TechnologyType.LENS_SC.value,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
      neg_controls_percentage,
    )
    data['batch_info'] = {o['name']: o for o in batch_info}
    data['s3_id'] = s3_id

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_s3',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id
    )




[docs]
  def submit_study_from_s3_proteomics(
    self,
    group_id: str,
    s3_id: str = None,
    batch_info: dict = dict(),
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None
  ):
    """
    Submit one Proteomics image.

    Parameters
    ----------
    group_id : str
        ID of the group to submit the data to.
    s3_id : str, Optional
        ID of s3 bucket. Default: None\n
        If s3_id is not provided, we will use the first s3 bucket configured on the platform.
    batch_info : Dict[]
        File path and batch name information, the path DOES NOT included the bucket path!\n
        Example:
          {
            'image': 's3_path/image.ome.tiff'
          }
    study_id : str, optional
        Will be the displaying name of study (eg: CODEX_BRAIN). Default: uuidv4
    name : str, optional
        Name of the study. Default: 'TBD'
    authors : List[str], optional
        Authors of the study. Default: []
    abstract : str, optional
        Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
        Species of the study. Default: 'human'.\n
        Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    min_counts : int, optional
        Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
        Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
        Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
        Maximum number of genes expressed required for a cell to pass filtering. Default: inf

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      -1,
      TechnologyType.PROTEOMICS.value,
      min_counts,
      min_genes,
      max_counts,
      max_genes
    )
    batch_info_name = batch_info['image'].split('/')[-1]
    data['batch_info'] = {batch_info_name: {'name': batch_info_name, 'image': batch_info['image']}}
    data['s3_id'] = s3_id

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_s3',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id
    )




[docs]
  def submit_study_from_shared_s3_lens_sc(
    self,
    group_id: str,
    shared_s3_id: str,
    batch_info: List[dict] = [],
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    study_type: int = StudyType.XENIUM.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    neg_controls_percentage: Union[int, float] = None,
  ):
    """
    Submit multiple single cell - spatial folders.

    Parameters
    ----------
    group_id : str
        ID of the group to submit the data to.
    shared_s3_id : str
        ID of s3 bucket.
    batch_info : List[dict]
        File path and batch name information, the path DOES NOT include the bucket path configured on platform!\n
        Example:
          [{
            'name': 'study_1',\n
            'folder': 's3_path/study_folder',
          }, {...}]
    study_id : str, optional
        Will be the displaying name of study (eg: COSMX_BRAIN). Default: uuidv4\n
    name : str, optional
        Name of the study. Default: 'TBD'
    authors : List[str], optional
        Authors of the study. Default: []
    abstract : str, optional
        Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
        Species of the study. Default: 'human'.\n
        Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    study_type : bioturing_connector.typing.StudyType, optional
        Format of the study. Default: bioturing_connector.typing.StudyType.XENIUM.value.\n
        Support:
                bioturing_connector.typing.StudyType.VIZGEN.value\n
                bioturing_connector.typing.StudyType.COSMX.value\n
                bioturing_connector.typing.StudyType.XENIUM.value\n
    min_counts : int, optional
        Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
        Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
        Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
        Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    neg_controls_percentage : int, optional
        Maximum number of control/negative genes percentage required for a cell to pass filtering. Default: 100\n
        Ranging from 0 to 100

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    self._check_lens_sc_study_type(study_type)
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      study_type,
      TechnologyType.LENS_SC.value,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
      neg_controls_percentage,
    )
    data['batch_info'] = {o['name']: o for o in batch_info}
    data['shared_s3_id'] = shared_s3_id

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_shared_s3',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id
    )




[docs]
  def submit_study_from_shared_s3_proteomics(
    self,
    group_id: str,
    shared_s3_id: str = None,
    batch_info: dict = dict(),
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None
  ):
    """
    Submit one Proteomics image.

    Parameters
    ----------
    group_id : str
        ID of the group to submit the data to.
    shared_s3_id : str, Optional
        ID of s3 bucket
    batch_info : Dict[]
        File path and batch name information, the path DOES NOT included the bucket path!\n
        Example:
          {
            'image': 's3_path/image.ome.tiff'
          }
    study_id : str, optional
        Will be the displaying name of study (eg: CODEX_BRAIN). Default: uuidv4
    name : str, optional
        Name of the study. Default: 'TBD'
    authors : List[str], optional
        Authors of the study. Default: []
    abstract : str, optional
        Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
        Species of the study. Default: 'human'.\n
        Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    min_counts : int, optional
        Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
        Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
        Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
        Maximum number of genes expressed required for a cell to pass filtering. Default: inf

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      -1,
      TechnologyType.PROTEOMICS.value,
      min_counts,
      min_genes,
      max_counts,
      max_genes
    )
    batch_info_name = batch_info['image'].split('/')[-1]
    data['batch_info'] = {batch_info_name: {'name': batch_info_name, 'image': batch_info['image']}}
    data['shared_s3_id'] = shared_s3_id

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_shared_s3',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id
    )




[docs]
  def submit_study_from_local_lens_sc(
    self,
    group_id: str,
    batch_info: List[dict],
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    study_type: int = StudyType.XENIUM.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    neg_controls_percentage: Union[int, float] = None,
    chunk_size: int = ChunkSize.CHUNK_100_MB.value,
  ):
    """
    Submit multiple single cell - spatial folders.

    Parameters
    ----------
    group_id : str
        ID of the group to submit the data to.
    batch_info : List[dict]
        File path and batch name information\n
        Example:
          [{
            'name': 'dataset_1',\n
            'folder': 'server_path/dataset_folder_1',
          }, {...}]
    study_id : str, optional
        Will be the displaying name of study (eg: COSMX_BRAIN). Default: uuidv4
    name : str, optional
        Name of the study. Default: 'TBD'
    authors : List[str], optional
        Authors of the study. Default: []
    abstract : str, optional
        Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
        Species of the study. Default: 'human'.\n
        Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    study_type : bioturing_connector.typing.StudyType, optional
        Format of the study. Default: bioturing_connector.typing.StudyType.XENIUM.value\n
        Support:
                bioturing_connector.typing.StudyType.VIZGEN.value\n
                bioturing_connector.typing.StudyType.COSMX.value\n
                bioturing_connector.typing.StudyType.XENIUM.value\n
    min_counts : int, optional
        Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
        Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
        Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
        Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    neg_controls_percentage : int, optional
        Maximum number of control/negative genes percentage required for a cell to pass filtering. Default: 100\n
        Ranging from 0 to 100
    chunk_size : bioturing_connector.typing.ChunkSize, optional
        Size of each separated chunk for uploading. Default: 104857600\n
        Support:
                bioturing_connector.typing.ChunkSize.CHUNK_5_MB.value
                bioturing_connector.typing.ChunkSize.CHUNK_100_MB.value
                bioturing_connector.typing.ChunkSize.CHUNK_500_MB.value
                bioturing_connector.typing.ChunkSize.CHUNK_1_GB.value

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """

    self._check_lens_sc_study_type(study_type)

    if len(np.unique([x['name'] for x in batch_info])) != len(batch_info):
      raise Exception('Names of batches must be unique')

    if chunk_size not in [e.value for e in ChunkSize]:
      return 'only support:\n{},\n{},\n{},\n{}'.format(
        'ChunkSize.CHUNK_5_MB.value',
        'ChunkSize.CHUNK_100_MB.value',
        'ChunkSize.CHUNK_500_MB.value',
        'ChunkSize.CHUNK_1_GB.value',
      )

    output_dir = self._upload_fol_lens_sc(batch_info, study_type, chunk_size)

    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      study_type,
      TechnologyType.LENS_SC.value,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
      neg_controls_percentage,
    )
    data['study_path'] = output_dir
    data['batch_info'] = [o['name'] for o in batch_info]

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_local',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id
    )




[docs]
  def submit_study_from_local_proteomics(
    self,
    group_id: str,
    batch_info: dict,
    study_id: str = None,
    name: str = 'TBD',
    authors: List[str] = [],
    abstract: str = '',
    species: str = Species.HUMAN.value,
    min_counts: int = None,
    min_genes: int = None,
    max_counts: int = None,
    max_genes: int = None,
    chunk_size: int = ChunkSize.CHUNK_100_MB.value,
  ):
    """
    Submit one Proteomics image.

    Parameters
    ----------
    group_id : str
        ID of the group to submit the data to.
    batch_info : List[]
        File path and batch name information\n
        Example:
          {
            'image': 'server_path/image.ome.tiff'
          }
    study_id : str, optional
        Will be the displaying name of study (eg: CODEX_BRAIN). Default: uuidv4
    name : str, optional
        Name of the study. Default: 'TBD'
    authors : List[str], optional
        Authors of the study. Default: []
    abstract : str, optional
        Abstract of the study. Default: ''
    species : bioturing_connector.typing.Species, optional
        Species of the study. Default: 'human'\n
        Support:
                bioturing_connector.typing.Species.HUMAN.value\n
                bioturing_connector.typing.Species.MOUSE.value\n
                bioturing_connector.typing.Species.PRIMATE.value\n
                bioturing_connector.typing.Species.OTHERS.value\n
    min_counts : int, optional
        Minimum number of counts required for a cell to pass filtering. Default: 0
    min_genes : int, optional
        Minimum number of genes expressed required for a cell to pass filtering. Default: 0
    max_counts : int, optional
        Maximum number of counts required for a cell to pass filtering. Default: inf
    max_genes : int, optional
        Maximum number of genes expressed required for a cell to pass filtering. Default: inf
    chunk_size : bioturing_connector.typing.ChunkSize, optional
        Size of each separated chunk for uploading. Default: 104857600.\n
        Support:
              bioturing_connector.typing.ChunkSize.CHUNK_5_MB.value\n
              bioturing_connector.typing.ChunkSize.CHUNK_100_MB.value\n
              bioturing_connector.typing.ChunkSize.CHUNK_500_MB.value\n
              bioturing_connector.typing.ChunkSize.CHUNK_1_GB.value\n

    Returns
    ----------
    Submission status : bool | str
      True or Error log
    """
    study_type = -1
    batch_info_name = batch_info['image'].split('/')[-1]
    file_names = [batch_info_name]
    files = [batch_info['image']]
    output_dir = self.upload_chunk(
      file_names,
      [Path(x) for x in files],
      chunk_size
    )

    data = self._submit_study(
      group_id,
      study_id,
      name,
      authors,
      abstract,
      species,
      study_type,
      TechnologyType.PROTEOMICS.value,
      min_counts,
      min_genes,
      max_counts,
      max_genes,
    )
    data['study_path'] = output_dir
    data['batch_info'] = file_names

    submission_status = self.post_request(
      api_route='api/v1/submit_study_from_local',
      data=data
    )

    task_id = common.parse_submission_status(submission_status)
    if task_id is None:
      return False

    return self.get_submission_log(
      group_id=group_id,
      task_id=task_id
    )