{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ed56c80a-2066-42ec-a199-3576aae50968",
   "metadata": {},
   "source": [
    "# SDK_Document_BBrowserX"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95aea447-4b8c-43d0-a00b-84cc3cbe7a8e",
   "metadata": {},
   "source": [
    "## Installation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1db9dd1-0d5e-48d7-b4ab-0934a2bad2cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the environment of the running kernel,\n",
    "# whereas `!python3 -m pip` may resolve to a different interpreter.\n",
    "%pip install -U bioturing_connector"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "deb8a6d8-dbbe-49bd-854d-0172eb84928a",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 1. Connect to host server"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df88f7b3-de4a-4e6a-8730-c6275c260ce4",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-info\"> <b>Must run this step before any further analyses</b> </div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09b17bd8-c0d1-4f9c-b7d2-620774f2c6e4",
   "metadata": {},
   "source": [
    "The user's token is generated on the host website."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "25c56829-e579-4354-8acf-3ce658c36212",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bioturing_connector.typing import Species\n",
    "from bioturing_connector.typing import ChunkSize\n",
    "from bioturing_connector.typing import StudyType\n",
    "from bioturing_connector.typing import StudyUnit\n",
    "from bioturing_connector.typing import InputMatrixType\n",
    "from bioturing_connector.bbrowserx_connector import BBrowserXConnector\n",
    "\n",
    "# Never store a real token in the notebook: read it from the\n",
    "# BIOTURING_TOKEN environment variable, and fall back to an interactive\n",
    "# prompt (input is not echoed) when the variable is not set.\n",
    "user_token = os.environ.get(\"BIOTURING_TOKEN\") or getpass(\"BioTuring token: \")\n",
    "\n",
    "connector = BBrowserXConnector(\n",
    "  host=\"https://talk2data.bioturing.com/t2d_index_tool/\",\n",
    "  token=user_token,\n",
    "  ssl=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "cb203d7a-3fee-4bce-9bb1-1df74c3b29c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Connecting to host at https://talk2data.bioturing.com/t2d_index_tool/api/v1/test_connection\n",
      "Connection successful\n"
     ]
    }
   ],
   "source": [
    "# Check that the host configured above can be reached before going further\n",
    "connector.test_connection()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8789beab-c2bc-4c1c-a474-63f75f0ebf11",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 2. List groups, studies and s3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13ee4cb3-ee8d-4691-9078-81c5b6e5af27",
   "metadata": {},
   "source": [
    "### 2.1. Get info of available groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dcdc00de-3e39-43a9-89d2-3d82a336129e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'group_id': 'all_members', 'group_name': 'All members'},\n",
       " {'group_id': 'bioturing_public_studies',\n",
       "  'group_name': 'BioTuring Public Studies'},\n",
       " {'group_id': 'personal', 'group_name': 'Personal workspace'}]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Each returned entry carries a 'group_id' and a 'group_name';\n",
    "# the group_id values are the ones used in the steps below.\n",
    "user_groups = connector.get_user_groups()\n",
    "user_groups"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdd6ea7e-918f-4900-a575-e7de250d1fa3",
   "metadata": {},
   "source": [
    "### 2.2. List all available studies in a group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fe08fe63-b90b-4e8f-a06b-5b27b3e55fef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'uuid': '80d76fc8136c4dfe807e3aa2beefca76',\n",
       "  'study_title': 'TBD',\n",
       "  'study_hash_id': 'COSMX_HUMAN_CORTEX',\n",
       "  'created_by': 'sonvo@bioturing.com'},\n",
       " {'uuid': 'a1558f8ed6064095be86a091a4118c4a',\n",
       "  'study_title': 'TBD',\n",
       "  'study_hash_id': 'GSE128223',\n",
       "  'created_by': 'sonvo@bioturing.com'}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Using group_id from step 2.1\n",
    "# species is passed as the enum's .value, not the enum member itself\n",
    "\n",
    "study_list = connector.get_all_studies_info_in_group(\n",
    "  group_id='personal',\n",
    "  species=Species.HUMAN.value,\n",
    ")\n",
    "study_list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f4d115a",
   "metadata": {},
   "source": [
    "### 2.3. List all s3 buckets of the current user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1071dc73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': '505e49d2abee405f8a7b4ce2628d5270',\n",
       "  'bucket': 'bioturingdebug',\n",
       "  'prefix': ''},\n",
       " {'id': 'd938706094354d7eb4726d6c9b07de9c',\n",
       "  'bucket': 'talk2data',\n",
       "  'prefix': ''}]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Each entry describes one bucket: 'id', 'bucket' and 'prefix'\n",
    "connector.get_user_s3()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "574d36b2",
   "metadata": {},
   "source": [
    "### 2.4. List all shared s3 of a group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4dec7471",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 'all_members' is one of the group_id values listed in step 2.1\n",
    "connector.get_shared_s3_of_group('all_members')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b34a16e5-feff-457e-8a67-d9619a2c668a",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 3. Submit study"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4f5719b-9303-4939-bc4f-1b027be36d10",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-success\">NOTE: Get <b>group_id</b> from step <b>\"2.1. Get info of available groups\"</b></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "adbb4166-03c2-4423-b1ff-77cae7e7bf04",
   "metadata": {},
   "source": [
    "### 3.1. Option 1: Submit study from s3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d14334c-787f-410a-8eaa-d9960b7fb05c",
   "metadata": {},
   "source": [
    "```\n",
    "Parameters:\n",
    "----\n",
    "group_id: str\n",
    "      ID of the group to submit the data to.\n",
    "s3_id: str\n",
    "      ID of s3 bucket. Default: None\n",
    "      If s3_id is not provided, we will use the first s3 bucket configured on the platform.\n",
    "batch_info: List[dict]\n",
    "      File path and batch name information, the path DOES NOT include bucket path configured on platform!\n",
    "      Example:\n",
    "        For H5AD format:\n",
    "          [{\n",
    "            'matrix': 's3_path/GSE128223_1.h5ad'\n",
    "          }, {...}]\n",
    "        For RDS format:\n",
    "          [{\n",
    "            'matrix': 's3_path/GSE128223_1.rds'\n",
    "          }, {...}]\n",
    "        For MTX_10X format:\n",
    "          [{\n",
    "            'matrix': 's3_path/data_1/matrix.mtx',\n",
    "            'features': 's3_path/data_1/features.tsv',\n",
    "            'barcodes': 's3_path/data_1/barcodes.tsv',\n",
    "          }, {...}]\n",
    "        For TILE_DB format:\n",
    "          [{\n",
    "            'folder': 's3_path/GSE128223_1'\n",
    "          }, {...}]\n",
    "study_id: str\n",
    "      Will be name of study (eg: GSE128223)\n",
    "      If no value is provided, default id will be a random uuidv4 string\n",
    "name: str\n",
    "      Name of the study.\n",
    "authors: List[str]\n",
    "      Authors of the study.\n",
    "abstract: str\n",
    "      Abstract of the study.\n",
    "species: str\n",
    "      Species of the study.\n",
    "      Support:\n",
    "            Species.HUMAN.value\n",
    "            Species.MOUSE.value\n",
    "            Species.NON_HUMAN_PRIMATE.value\n",
    "            Species.OTHERS.value\n",
    "skip_dimred: bool\n",
    "      Skip BioTuring pipeline if set to True (only applicable when input is a scanpy/seurat object).\n",
    "input_matrix_type: str\n",
    "      Is the input matrix already normalized or not?\n",
    "      Support:\n",
    "            InputMatrixType.NORMALIZED.value (will skip BioTuring normalization, h5ad: use adata.X)\n",
    "            InputMatrixType.RAW.value (apply BioTuring normalization, h5ad: use adata.raw.X)\n",
    "study_type: int\n",
    "      Format of dataset\n",
    "      Support:\n",
    "            StudyType.BBROWSER.value\n",
    "            StudyType.H5_10X.value\n",
    "            StudyType.H5AD.value\n",
    "            StudyType.MTX_10X.value\n",
    "            StudyType.BCS.value\n",
    "            StudyType.RDS.value\n",
    "            StudyType.TSV.value\n",
    "            StudyType.TILE_DB.value\n",
    "min_counts: int\n",
    "      Minimum number of counts required for a cell to pass filtering.\n",
    "min_genes: int\n",
    "      Minimum number of genes expressed required for a cell to pass filtering.\n",
    "max_counts: int\n",
    "      Maximum number of counts required for a cell to pass filtering.\n",
    "max_genes: int\n",
    "      Maximum number of genes expressed required for a cell to pass filtering.\n",
    "mt_percentage: int\n",
    "      Maximum percentage of mitochondrial genes allowed for a cell to pass filtering.\n",
    "      Ranging from 0 to 100\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca8dd719",
   "metadata": {},
   "source": [
    "#### 3.1.1. 10X Matrix format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "7d47fc3e-295a-4757-a6e4-79e8fa213381",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2023-09-26 06:08] Waiting in queue\n",
      "[2023-09-26 06:08] Downloading GSE128223/raw/barcodes.tsv from s3: 262.1 KB / 539.5 KB\n",
      "[2023-09-26 06:08] Downloading GSE128223/raw/features.tsv from s3: 262.1 KB / 322.8 KB\n",
      "[2023-09-26 06:08] Downloading GSE128223/raw/matrix.mtx from s3: 262.1 KB / 927.0 MB\n",
      "[2023-09-26 06:09] File downloaded\n",
      "[2023-09-26 06:09] Reading batch: raw\n",
      "[2023-09-26 06:09] Preprocessing expression matrix: 20923 cells x 35756 genes\n",
      "[2023-09-26 06:09] Filtered: 20923 cells remain\n",
      "[2023-09-26 06:09] Start processing study\n",
      "[2023-09-26 06:09] Normalizing expression matrix\n",
      "[2023-09-26 06:09] Running PCA\n",
      "[2023-09-26 06:09] Running kNN\n",
      "[2023-09-26 06:09] Running venice binarizer\n",
      "[2023-09-26 06:09] Running t-SNE\n",
      "[2023-09-26 06:09] Study was successfully submitted\n",
      "[2023-09-26 06:09] DONE!!!\n",
      "Study submitted successfully!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 'GSE128223/raw/matrix.mtx',\n",
    "    'features': 'GSE128223/raw/features.tsv',\n",
    "    'barcodes': 'GSE128223/raw/barcodes.tsv',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_s3(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.MTX_10X.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "618b3cc4",
   "metadata": {},
   "source": [
    "#### 3.1.2. Scanpy object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fe304c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 's3_path/GSE128223_1.h5ad',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_s3(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.H5AD.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e716529",
   "metadata": {},
   "source": [
    "#### 3.1.3. Seurat object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a72c5aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 's3_path/GSE128223_1.rds',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_s3(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.RDS.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44ceb231",
   "metadata": {},
   "source": [
    "#### 3.1.4. Tile DB format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15c61880",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'folder': 's3_path/GSE128223_1',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_s3(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.TILE_DB.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b90c54a4",
   "metadata": {},
   "source": [
    "#### 3.1.5. Full matrix dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c7d4313",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 's3_path/GSE128223_1.tsv',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_s3(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.TSV.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "77a5cca2-708d-4e12-84a1-fb8f5e2da94e",
   "metadata": {},
   "source": [
    "### 3.2. Option 2: Submit study from local machine"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3dc20cec-48a3-45bd-910e-1b87c90f20b1",
   "metadata": {},
   "source": [
    "```\n",
    "Parameters:\n",
    "------\n",
    "group_id: str\n",
    "      ID of the group to submit the data to.\n",
    "batch_info: List[dict]\n",
    "      File path and batch name information.\n",
    "      Example:\n",
    "        For H5AD format:\n",
    "          [{\n",
    "            'matrix': 'local_path/GSE128223_1.h5ad'\n",
    "          }, {...}]\n",
    "        For RDS format:\n",
    "          [{\n",
    "            'matrix': 'local_path/GSE128223_1.rds'\n",
    "          }, {...}]\n",
    "        For MTX_10X format:\n",
    "          [{\n",
    "            'name': 'data_1',\n",
    "            'matrix': 'local_path/data_1/matrix.mtx',\n",
    "            'features': 'local_path/data_1/features.tsv',\n",
    "            'barcodes': 'local_path/data_1/barcodes.tsv',\n",
    "          }, {...}]\n",
    "study_id: str\n",
    "      If no value is provided, default id will be a random uuidv4 string\n",
    "name: str\n",
    "      Name of the study.\n",
    "authors: List[str]\n",
    "      Authors of the study.\n",
    "abstract: str\n",
    "      Abstract of the study.\n",
    "species: str\n",
    "      Species of the study.\n",
    "      Support:\n",
    "            Species.HUMAN.value\n",
    "            Species.MOUSE.value\n",
    "            Species.NON_HUMAN_PRIMATE.value\n",
    "            Species.OTHERS.value\n",
    "skip_dimred: bool\n",
    "      Skip BioTuring pipeline if set to True (only applicable when input is a scanpy/seurat object).\n",
    "input_matrix_type: str\n",
    "      Is the input matrix already normalized or not?\n",
    "      Support:\n",
    "          InputMatrixType.NORMALIZED.value (will skip BioTuring normalization, h5ad: use adata.X)\n",
    "          InputMatrixType.RAW.value (apply BioTuring normalization, h5ad: use adata.raw.X)\n",
    "study_type: int\n",
    "      Format of dataset\n",
    "      Support:\n",
    "            StudyType.BBROWSER.value\n",
    "            StudyType.H5_10X.value\n",
    "            StudyType.H5AD.value\n",
    "            StudyType.MTX_10X.value\n",
    "            StudyType.BCS.value\n",
    "            StudyType.RDS.value\n",
    "            StudyType.TSV.value\n",
    "min_counts: int\n",
    "      Minimum number of counts required for a cell to pass filtering.\n",
    "min_genes: int\n",
    "      Minimum number of genes expressed required for a cell to pass filtering.\n",
    "max_counts: int\n",
    "      Maximum number of counts required for a cell to pass filtering.\n",
    "max_genes: int\n",
    "      Maximum number of genes expressed required for a cell to pass filtering.\n",
    "mt_percentage: int\n",
    "      Maximum percentage of mitochondrial genes allowed for a cell to pass filtering.\n",
    "      Ranging from 0 to 100\n",
    "chunk_size: int\n",
    "      Size of each separated chunk for uploading. Default: ChunkSize.CHUNK_100_MB.value\n",
    "      Support:\n",
    "            ChunkSize.CHUNK_5_MB.value\n",
    "            ChunkSize.CHUNK_100_MB.value\n",
    "            ChunkSize.CHUNK_500_MB.value\n",
    "            ChunkSize.CHUNK_1_GB.value\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e232c8c8",
   "metadata": {},
   "source": [
    "#### 3.2.1. 10X Matrix format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c6168f9e-d417-40c5-99c3-fce09036c607",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "GSE128223matrix.mtx - chunk_0: 100MMB [00:08, 12.2MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_1: 100MMB [00:09, 11.5MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_2: 100MMB [00:08, 12.4MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_3: 100MMB [00:10, 10.3MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_4: 100MMB [00:10, 10.1MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_5: 100MMB [00:11, 9.27MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_6: 100MMB [00:11, 8.90MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_7: 100MMB [00:07, 13.7MMB/s]                                                                                                                                                                                      \n",
      "GSE128223matrix.mtx - chunk_8:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 84.0M/100M [00:02<00:00, 38.6MMB/s]\n",
      "GSE128223features.tsv - chunk_0:   0%|▌                                                                                                                                                                   | 316k/100M [00:00<00:11, 8.95MMB/s]\n",
      "GSE128223barcodes.csv - chunk_0:   1%|▊                                                                                                                                                                   | 527k/100M [00:00<00:05, 17.9MMB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2023-09-26 06:15] Waiting in queue\n",
      "[2023-09-26 06:15] Reading batch: GSE128223\n",
      "[2023-09-26 06:15] Preprocessing expression matrix: 20923 cells x 35756 genes\n",
      "[2023-09-26 06:15] Filtered: 20923 cells remain\n",
      "[2023-09-26 06:15] Start processing study\n",
      "[2023-09-26 06:15] Normalizing expression matrix\n",
      "[2023-09-26 06:15] Running PCA\n",
      "[2023-09-26 06:15] Running kNN\n",
      "[2023-09-26 06:15] Running venice binarizer\n",
      "[2023-09-26 06:15] Running t-SNE\n",
      "[2023-09-26 06:15] Study was successfully submitted\n",
      "[2023-09-26 06:15] DONE!!!\n",
      "Study submitted successfully!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'name': 'GSE128223',\n",
    "    'matrix': '/data/dev/example_dataset/GSE128223/raw/matrix.mtx',\n",
    "    'features': '/data/dev/example_dataset/GSE128223/raw/features.tsv',\n",
    "    'barcodes': '/data/dev/example_dataset/GSE128223/raw/barcodes.tsv',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_local(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.MTX_10X.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a74f690",
   "metadata": {},
   "source": [
    "#### 3.2.2. Scanpy object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "451b7a39",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 'local_path/GSE128223_1.h5ad',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_local(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.H5AD.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afdfb32c",
   "metadata": {},
   "source": [
    "#### 3.2.3. Seurat object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89e38540",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 'local_path/GSE128223_1.rds',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_local(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.RDS.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5899688d",
   "metadata": {},
   "source": [
    "#### 3.2.4. Full matrix dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e78b02c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 'local_path/GSE128223_1.tsv',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_local(\n",
    "  group_id='personal',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.TSV.value\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea3c8022",
   "metadata": {},
   "source": [
    "### 3.3. Option 3: Submit study with shared s3 of a group"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ae8914b",
   "metadata": {},
   "source": [
    "```\n",
    "Parameters:\n",
    "----\n",
    "group_id: str\n",
    "      ID of the group to submit the data to.\n",
    "shared_s3_id: str\n",
    "      ID of s3 bucket\n",
    "batch_info: List[dict]\n",
    "      File path and batch name information, the path DOES NOT include bucket path configured on platform!\n",
    "      Example:\n",
    "        For H5AD format:\n",
    "          [{\n",
    "            'matrix': 's3_path/GSE128223_1.h5ad'\n",
    "          }, {...}]\n",
    "        For RDS format:\n",
    "          [{\n",
    "            'matrix': 's3_path/GSE128223_1.rds'\n",
    "          }, {...}]\n",
    "        For MTX_10X format:\n",
    "          [{\n",
    "            'matrix': 's3_path/data_1/matrix.mtx',\n",
    "            'features': 's3_path/data_1/features.tsv',\n",
    "            'barcodes': 's3_path/data_1/barcodes.tsv',\n",
    "          }, {...}]\n",
    "        For TILE_DB format:\n",
    "          [{\n",
    "            'folder': 's3_path/GSE128223_1'\n",
    "          }, {...}]\n",
    "study_id: str\n",
    "      Will be name of study (eg: GSE128223)\n",
    "      If no value is provided, default id will be a random uuidv4 string\n",
    "name: str\n",
    "      Name of the study.\n",
    "authors: List[str]\n",
    "      Authors of the study.\n",
    "abstract: str\n",
    "      Abstract of the study.\n",
    "species: str\n",
    "      Species of the study.\n",
    "      Support:\n",
    "            Species.HUMAN.value\n",
    "            Species.MOUSE.value\n",
    "            Species.NON_HUMAN_PRIMATE.value\n",
    "            Species.OTHERS.value\n",
    "skip_dimred: bool\n",
    "      Skip BioTuring pipeline if set to True (only applicable when input is a scanpy/seurat object).\n",
    "input_matrix_type: str\n",
    "      Is the input matrix already normalized or not?\n",
    "      Support:\n",
    "            InputMatrixType.NORMALIZED.value (will skip BioTuring normalization, h5ad: use adata.X)\n",
    "            InputMatrixType.RAW.value (apply BioTuring normalization, h5ad: use adata.raw.X)\n",
    "study_type: int\n",
    "      Format of dataset\n",
    "      Support:\n",
    "            StudyType.BBROWSER.value\n",
    "            StudyType.H5_10X.value\n",
    "            StudyType.H5AD.value\n",
    "            StudyType.MTX_10X.value\n",
    "            StudyType.BCS.value\n",
    "            StudyType.RDS.value\n",
    "            StudyType.TSV.value\n",
    "            StudyType.TILE_DB.value\n",
    "min_counts: int\n",
    "      Minimum number of counts required for a cell to pass filtering.\n",
    "min_genes: int\n",
    "      Minimum number of genes expressed required for a cell to pass filtering.\n",
    "max_counts: int\n",
    "      Maximum number of counts required for a cell to pass filtering.\n",
    "max_genes: int\n",
    "      Maximum number of genes expressed required for a cell to pass filtering.\n",
    "mt_percentage: int\n",
    "      Maximum percentage of mitochondrial genes allowed for a cell to pass filtering.\n",
    "      Ranging from 0 to 100\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56cc0046",
   "metadata": {},
   "source": [
    "#### 3.3.1. 10X Matrix format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a72e226",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 'GSE128223/raw/matrix.mtx',\n",
    "    'features': 'GSE128223/raw/features.tsv',\n",
    "    'barcodes': 'GSE128223/raw/barcodes.tsv',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_shared_s3(\n",
    "  group_id='6b3cfc27fa694779a1b2a5015e438b94',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.MTX_10X.value,\n",
    "  shared_s3_id='15de18d355b4ce0a1u512a5b45c8e3c'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "309837d8",
   "metadata": {},
   "source": [
    "#### 3.3.2. Scanpy object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4eaa54ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 's3_path/GSE128223_1.h5ad',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_shared_s3(\n",
    "  group_id='6b3cfc27fa694779a1b2a5015e438b94',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.H5AD.value,\n",
    "  shared_s3_id='15de18d355b4ce0a1u512a5b45c8e3c'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b1f0b1e",
   "metadata": {},
   "source": [
    "#### 3.3.3. Seurat object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d113363",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission: add one more dict to the list\n",
    "## for each extra batch (every entry must be a dict with the same keys).\n",
    "batch_info = [{\n",
    "    'matrix': 's3_path/GSE128223_1.rds',\n",
    "}]\n",
    "\n",
    "connector.submit_study_from_shared_s3(\n",
    "  group_id='6b3cfc27fa694779a1b2a5015e438b94',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.RDS.value,\n",
    "  shared_s3_id='15de18d355b4ce0a1u512a5b45c8e3c'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fea38074",
   "metadata": {},
   "source": [
    "#### 3.3.4. Tile DB format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "012c2e3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission\n",
    "batch_info = [{\n",
    "    'folder': 's3_path/GSE128223_1',\n",
    "}, {...}]\n",
    "\n",
    "connector.submit_study_from_shared_s3(\n",
    "  group_id='6b3cfc27fa694779a1b2a5015e438b94',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.TILE_DB.value,\n",
    "  shared_s3_id='15de18d355b4ce0a1u512a5b45c8e3c'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58e17f51",
   "metadata": {},
   "source": [
    "#### 3.3.5. Full matrix dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a3a143b",
   "metadata": {},
   "outputs": [],
   "source": [
    "## The path DOES NOT include the bucket path configured on platform\n",
    "## Support multiple batches per submission\n",
    "batch_info = [{\n",
    "    'matrix': 's3_path/GSE128223_1.tsv',\n",
    "}, {...}]\n",
    "\n",
    "connector.submit_study_from_shared_s3(\n",
    "  group_id='6b3cfc27fa694779a1b2a5015e438b94',\n",
    "  batch_info=batch_info,\n",
    "  study_id='GSE128223',\n",
    "  name='This is my first study',\n",
    "  authors=['Huy Nguyen', 'Thao Truong'],\n",
    "  species=Species.HUMAN.value,\n",
    "  input_matrix_type=InputMatrixType.RAW.value,\n",
    "  study_type=StudyType.TSV.value,\n",
    "  shared_s3_id='15de18d355b4ce0a1u512a5b45c8e3c'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b53fe51a-7a2b-4a22-ad70-0473fd7f8538",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 4. Submit metadata"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df0aec6d-9472-482e-b475-55e15cdf2395",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-success\">NOTE: Get <b>group_id</b> and <b>study_id (uuid)</b> from step <b>\"2. List groups and studies\"</b></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "026597a6-87e0-4926-8d1e-83baa57aed9e",
   "metadata": {},
   "source": [
    "### 4.1. Submit a dataframe directly "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8ed9de0-e633-4da4-9570-021f38514732",
   "metadata": {},
   "source": [
    "This is an example metadata. Barcodes column must be DataFrame.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "18383247-bb97-4950-bc3d-5c5e06b7a927",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cell type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Barcodes</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>donor1_d1_AAACCTGGTAGAGGAA</th>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>donor1_d1_AAACGGGCAGACACTT</th>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>donor1_d1_AAAGCAAAGAGTAATC</th>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>donor1_d1_AAAGCAATCATGCATG</th>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>donor1_d1_AAAGCAATCCTCAACC</th>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pbmc_8k_TTTGTCATCATGTCCC</th>\n",
       "      <td>naive CD8 T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pbmc_8k_TTTGTCATCCGATATG</th>\n",
       "      <td>naive CD8 T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pbmc_8k_TTTGTCATCGTCTGAA</th>\n",
       "      <td>monocyte</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pbmc_8k_TTTGTCATCTCGAGTA</th>\n",
       "      <td>CD8 T cell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pbmc_8k_TTTGTCATCTGCTTGC</th>\n",
       "      <td>naive CD8 T cell</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>19121 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  Cell type\n",
       "Barcodes                                                   \n",
       "donor1_d1_AAACCTGGTAGAGGAA  TCRV delta 1 gamma-delta T cell\n",
       "donor1_d1_AAACGGGCAGACACTT  TCRV delta 1 gamma-delta T cell\n",
       "donor1_d1_AAAGCAAAGAGTAATC  TCRV delta 1 gamma-delta T cell\n",
       "donor1_d1_AAAGCAATCATGCATG  TCRV delta 1 gamma-delta T cell\n",
       "donor1_d1_AAAGCAATCCTCAACC  TCRV delta 1 gamma-delta T cell\n",
       "...                                                     ...\n",
       "pbmc_8k_TTTGTCATCATGTCCC                   naive CD8 T cell\n",
       "pbmc_8k_TTTGTCATCCGATATG                   naive CD8 T cell\n",
       "pbmc_8k_TTTGTCATCGTCTGAA                           monocyte\n",
       "pbmc_8k_TTTGTCATCTCGAGTA                         CD8 T cell\n",
       "pbmc_8k_TTTGTCATCTGCTTGC                   naive CD8 T cell\n",
       "\n",
       "[19121 rows x 1 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_df = pd.read_csv('GSE128223_metadata.tsv', sep='\\t', index_col=0)\n",
    "meta_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "df777667-9090-4a8c-a7da-381d9d08a91b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Successful'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "connector.submit_metadata_from_dataframe(\n",
    "    species=Species.HUMAN.value,\n",
    "    study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "    group_id='personal',\n",
    "    df=meta_df\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f28f3b69-5847-4c94-abdc-c6aa12811ed6",
   "metadata": {},
   "source": [
    "### 4.2. Submit file from local / server"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "638fa955-7096-44c4-8851-ffe1fe2b1e07",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Successful'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "connector.submit_metadata_from_local(\n",
    "    species=Species.HUMAN.value,\n",
    "    study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "    group_id='personal',\n",
    "    file_path='./GSE128223_metadata.tsv'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbafd539-da4d-43b0-b26b-dfe0c672142f",
   "metadata": {},
   "source": [
    "### 4.3. Submit file from s3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "881ea5a8-f696-42a4-b492-a400503e3d48",
   "metadata": {},
   "outputs": [],
   "source": [
    "connector.submit_metadata_from_s3(\n",
    "    species=Species.HUMAN.value,\n",
    "    study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "    group_id='personal',\n",
    "    file_path='test_bucket/GSE128223_meta.tsv'        #This path DOES NOT include the bucket path configured on platform e.g. s3://bioturing_bucket\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0f81843",
   "metadata": {},
   "source": [
    "### 4.4. Submit file from shared s3 of a group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fdc47e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "connector.submit_metadata_from_shared_s3(\n",
    "    species=Species.HUMAN.value,\n",
    "    study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "    group_id='bioturing_public_studies',              #This function DOES NOT applied for group_id='personal'\n",
    "    file_path='test_bucket/GSE128223_meta.tsv',        #This path DOES NOT include the bucket path configured on platform e.g. s3://bioturing_bucket\n",
    "\tshared_s3_id='ce26142487ed4a3697bb8902bf9d9670'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cab4732-521a-4605-a103-c36363621f46",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 5. Access study data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "133635b7-789e-4414-8b71-a48de2de42be",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-success\">NOTE: Get <b>study_id (uuid)</b> from step <b>\"2.2. List all available studies in a group\"</b></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9b81b3d-27f3-4aa4-a181-818e94843fe4",
   "metadata": {},
   "source": [
    "### 5.1. Get barcodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "34ce2be9-6bd0-4ddf-b7bc-75ee2d307c3e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['donor1_d1_AAACCTGGTAGAGGAA' 'donor1_d1_AAACGGGCAGACACTT'\n",
      " 'donor1_d1_AAAGCAAAGAGTAATC' ... 'pbmc_8k_TTTGTCATCGTCTGAA'\n",
      " 'pbmc_8k_TTTGTCATCTCGAGTA' 'pbmc_8k_TTTGTCATCTGCTTGC']\n"
     ]
    }
   ],
   "source": [
    "barcodes = np.array(connector.get_barcodes(\n",
    "  study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "  species=Species.HUMAN.value,\n",
    "))\n",
    "print(barcodes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3785035a-dd98-4fd5-b312-06698108900c",
   "metadata": {},
   "source": [
    "### 5.2. Get features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8a377a4d-adbc-4c3a-bd32-b578217bc4d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['5S_RRNA' '5_8S_RRNA' '7SK' ... 'THRA1/BTR' 'UTAT33' 'ZSCAN5CP']\n"
     ]
    }
   ],
   "source": [
    "features = np.array(connector.get_features(\n",
    "  study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "  species=Species.HUMAN.value,\n",
    "))\n",
    "print(features)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4bd5793d-6f41-4cf0-835f-da9b7388c548",
   "metadata": {},
   "source": [
    "### 5.3. Get metadata dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "3225fb73-1ff3-42f2-ae0b-28db50ba3392",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Barcodes</th>\n",
       "      <th>Cell type</th>\n",
       "      <th>Cell type (1)</th>\n",
       "      <th>Cell type (2)</th>\n",
       "      <th>Cmv status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>donor1_d1_AAACCTGGTAGAGGAA</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>CMV+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>donor1_d1_AAACGGGCAGACACTT</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>CMV+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>donor1_d1_AAAGCAAAGAGTAATC</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>CMV+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>donor1_d1_AAAGCAATCATGCATG</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>CMV+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>donor1_d1_AAAGCAATCCTCAACC</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>TCRV delta 1 gamma-delta T cell</td>\n",
       "      <td>CMV+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Barcodes                        Cell type  \\\n",
       "0  donor1_d1_AAACCTGGTAGAGGAA  TCRV delta 1 gamma-delta T cell   \n",
       "1  donor1_d1_AAACGGGCAGACACTT  TCRV delta 1 gamma-delta T cell   \n",
       "2  donor1_d1_AAAGCAAAGAGTAATC  TCRV delta 1 gamma-delta T cell   \n",
       "3  donor1_d1_AAAGCAATCATGCATG  TCRV delta 1 gamma-delta T cell   \n",
       "4  donor1_d1_AAAGCAATCCTCAACC  TCRV delta 1 gamma-delta T cell   \n",
       "\n",
       "                     Cell type (1)                    Cell type (2) Cmv status  \n",
       "0  TCRV delta 1 gamma-delta T cell  TCRV delta 1 gamma-delta T cell       CMV+  \n",
       "1  TCRV delta 1 gamma-delta T cell  TCRV delta 1 gamma-delta T cell       CMV+  \n",
       "2  TCRV delta 1 gamma-delta T cell  TCRV delta 1 gamma-delta T cell       CMV+  \n",
       "3  TCRV delta 1 gamma-delta T cell  TCRV delta 1 gamma-delta T cell       CMV+  \n",
       "4  TCRV delta 1 gamma-delta T cell  TCRV delta 1 gamma-delta T cell       CMV+  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata = connector.get_metadata(\n",
    "  study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "  species=Species.HUMAN.value\n",
    ")\n",
    "metadata.iloc[:5, :5]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae37260a-47a5-419b-b60b-7945509bb24d",
   "metadata": {},
   "source": [
    "### 5.4. Get embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a133f3b-1edb-4cc3-8156-5faec669bd42",
   "metadata": {},
   "source": [
    "#### 5.4.1. List all embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b9d5a84c-b9aa-492d-ab5f-ce452aadb53e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'embedding_id': 'bee0c214d7d44dc1882313cc803aece3',\n",
       "  'embedding_name': '_x_pca'},\n",
       " {'embedding_id': '0c856f67796b4f4b86dbedb812974ff1',\n",
       "  'embedding_name': '_x_tsne'},\n",
       " {'embedding_id': '5ab6ae13ce344381a81aa7d6afb26616',\n",
       "  'embedding_name': 'PCA (no batch corrected)'},\n",
       " {'embedding_id': '21f767838c1c4d5095249dcdab9388eb',\n",
       "  'embedding_name': 'tSNE (perplexity=30)'}]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embeddings = connector.list_all_custom_embeddings(\n",
    "  study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "  species=Species.HUMAN.value,\n",
    ")\n",
    "embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "460e1bd8-3731-4804-abcb-09e80c8fc2b8",
   "metadata": {},
   "source": [
    "#### 5.4.2. Access an embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "7adbcbc2-8a98-49e2-9b03-aaf63e815d99",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-5.3032417 ,  7.8890934 ,  3.359574  , ...,  0.21355404,\n",
       "        -0.64777076, -1.6085205 ],\n",
       "       [-2.9219244 ,  0.11274821,  2.3836405 , ...,  0.06213907,\n",
       "        -0.1660905 ,  0.24691239],\n",
       "       [-5.4160094 , 12.229488  ,  7.7536416 , ..., -0.5595666 ,\n",
       "         1.1389648 ,  0.28183457],\n",
       "       ...,\n",
       "       [17.052692  ,  8.085365  , -6.64449   , ...,  0.6446202 ,\n",
       "        -0.95552135, -1.0086697 ],\n",
       "       [-2.2584836 , -3.0889986 ,  2.9076786 , ...,  1.5332366 ,\n",
       "        -0.38599294, -0.29490623],\n",
       "       [-2.2893648 , -7.0735717 ,  1.3277851 , ..., -0.13736992,\n",
       "        -1.7899635 ,  0.07911549]], dtype=float32)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chosen_embedding = connector.retrieve_custom_embedding(\n",
    "  study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "  species=Species.HUMAN.value,\n",
    "  embedding_id='bee0c214d7d44dc1882313cc803aece3',\n",
    ")\n",
    "chosen_embedding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f0f891f-cae6-4f43-98fe-ba8a6bedc049",
   "metadata": {},
   "source": [
    "### 5.5. Query genes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ada2bf19-a504-41cc-a1fa-d49a63214b6b",
   "metadata": {},
   "source": [
    "```\n",
    "Parameters:\n",
    "----\n",
    "group_id: str\n",
    "    ID of the group to submit the data to.\n",
    "study_id: str\n",
    "    If no value is provided, default id will be a random uuidv4 string\n",
    "gene_names: List[str], default=[]\n",
    "    If the value array is empty, the return value will be the whole matrix\n",
    "unit: str\n",
    "    Support:\n",
    "          StudyUnit.UNIT_LOGNORM.value\n",
    "          StudyUnit.UNIT_RAW.value\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "e9df1399-872a-42b7-b30c-8c37513bbbfe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<19121x2 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 17584 stored elements in Compressed Sparse Column format>"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_exp = connector.query_genes(\n",
    "  study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "  species=Species.HUMAN.value,\n",
    "  gene_names=['CD3D', 'CD8A'],\n",
    "  unit=StudyUnit.UNIT_RAW.value,\n",
    ")\n",
    "gene_exp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f15ab15c-10cd-4700-811d-749267b1221c",
   "metadata": {},
   "source": [
    "## 6. Standardize your metadata"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b92008b6-a66f-4e7a-9499-c69b20b362ed",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-success\">NOTE: Get <b>group_id</b> and <b>study_id (uuid)</b> from step <b>\"2. List groups and studies\"</b></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb5b1d21-e86d-4b7b-8cc8-046b002465aa",
   "metadata": {},
   "source": [
    "### 6.1. Retrieve ontology tree"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e7d198e",
   "metadata": {},
   "source": [
    "```\n",
    "Returns\n",
    "----------\n",
    "Ontologies tree : Dict[Dict]\n",
    "  In which:\n",
    "    'name': name of the node, which will be used in further steps\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c679d760-91aa-42d5-80ab-311638a3adca",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "connector.get_ontologies_tree(\n",
    "    species=Species.HUMAN.value,\n",
    "    group_id='bioturing_public_studies'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0bb5d9b-f382-4678-8963-7651a9f43b9b",
   "metadata": {},
   "source": [
    "### 6.2. Assign standardized terms"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b771845-9659-4f18-9c1f-05a1ca2593b3",
   "metadata": {},
   "source": [
    "```\n",
    "Parameters\n",
    "-----\n",
    "species: str\n",
    "      Species of the study.\n",
    "      Support:  Species.HUMAN.value\n",
    "                Species.MOUSE.value\n",
    "                Species.PRIMATE.value\n",
    "                Species.OTHERS.value\n",
    "group_id: str\n",
    "      ID of the group to submit the data to.\n",
    "study_id: str\n",
    "      ID of the study (uuid)\n",
    "metadata_field: str\n",
    "      column name of meta dataframe in platform (eg: author's tissue)\n",
    "metadata_value: str\n",
    "      metadata value within the metadata field (eg: normal lung)\n",
    "root_name: str\n",
    "      name of root in btr ontologies tree (eg: tissue)\n",
    "leaf_name: str\n",
    "      name of leaf in btr ontologies tree (eg: lung)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0979e63-b9cf-4b24-85d6-004cd9f3d784",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This function is only usable in a group (not 'personal')\n",
    "\n",
    "connector.assign_standardized_meta(\n",
    "    species=Species.HUMAN.value,\n",
    "    group_id='bioturing_public_studies',\n",
    "    study_id='a1558f8ed6064095be86a091a4118c4a',\n",
    "    metadata_field='Cell type',\n",
    "    metadata_value='TCRV delta 1 gamma-delta T cell',\n",
    "    root_name='cell type',\n",
    "    leaf_name='gamma-delta T cell',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}