1a:[[["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"itemListElement\":[]}"}}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"BreadcrumbList\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Computational Genomics\",\"item\":\"https://library.fiveable.me/computational-genomics\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Unit 9 – RNA-seq: Transcriptome Analysis\",\"item\":\"https://library.fiveable.me/computational-genomics/unit-9\"}]}"}}]],["$","$L1b",null,{"initialReduxState":{"initialToc":{"units":[{"id":"h3K4U8fVGJ8bDzAx","name":"Unit 1 – Genome Sequencing Technologies","emoji":"📚","slug":"unit-1","hasResources":true,"resources":[{"id":"Nv6MV94TbxFyrNlK","title":"1.1 Sanger sequencing","slug":"sanger-sequencing","type":"STUDY_GUIDE","date":null},{"id":"vAlKzWD5viW9VtSo","title":"1.3 Third-generation sequencing","slug":"third-generation-sequencing","type":"STUDY_GUIDE","date":null},{"id":"mrQUsoKtDbX08BzQ","title":"1.4 Sequencing platforms and instrumentation","slug":"sequencing-platforms-instrumentation","type":"STUDY_GUIDE","date":null},{"id":"0fQd3ElhUGJ1WMyI","title":"1.5 Sequencing strategies (whole-genome, exome, targeted)","slug":"sequencing-strategies-whole-genome-exome-targeted","type":"STUDY_GUIDE","date":null},{"id":"vSg0rNdUUvcyV6JS","title":"1.6 Quality control and preprocessing of sequencing data","slug":"quality-control-preprocessing-sequencing-data","type":"STUDY_GUIDE","date":null},{"id":"71rQ1N2Zmg8s6Kg9","title":"1.2 Next-generation sequencing (NGS)","slug":"next-generation-sequencing-ngs","type":"STUDY_GUIDE","date":null}]},{"id":"qS0mGJeSG4cuSZ0c","name":"Unit 2 – Sequence Alignment & Assembly","emoji":"📚","slug":"unit-2","hasResources":true,"resources":[{"id":"6WbJNa8aceTab2w2","title":"2.4 De novo 
assembly","slug":"de-novo-assembly","type":"STUDY_GUIDE","date":null},{"id":"Dgzm0Hy6TU9wwrVn","title":"2.5 Reference-guided assembly","slug":"reference-guided-assembly","type":"STUDY_GUIDE","date":null},{"id":"cCUmcOk95RciXvXW","title":"2.3 Sequence assembly algorithms","slug":"sequence-assembly-algorithms","type":"STUDY_GUIDE","date":null},{"id":"CvuRmJmtBugqfnmj","title":"2.6 Genome scaffolding and gap filling","slug":"genome-scaffolding-gap-filling","type":"STUDY_GUIDE","date":null},{"id":"Qaq92dA6SwIf6cDR","title":"2.1 Pairwise sequence alignment","slug":"pairwise-sequence-alignment","type":"STUDY_GUIDE","date":null},{"id":"5SmBFhFCXiN2dfjx","title":"2.2 Multiple sequence alignment","slug":"multiple-sequence-alignment","type":"STUDY_GUIDE","date":null}]},{"id":"7ikRoAY9EqRxMkmB","name":"Unit 3 – Genomic Data: Databases and Formats","emoji":"📚","slug":"unit-3","hasResources":true,"resources":[{"id":"JPea87p3usKdZaS5","title":"3.1 GenBank and EMBL databases","slug":"genbank-embl-databases","type":"STUDY_GUIDE","date":null},{"id":"l2pVmQQZfjPwUcX6","title":"3.3 SAM/BAM and VCF formats","slug":"sambam-vcf-formats","type":"STUDY_GUIDE","date":null},{"id":"hco6BKJeHFzHQjyx","title":"3.2 FASTA and FASTQ formats","slug":"fasta-fastq-formats","type":"STUDY_GUIDE","date":null},{"id":"2jR4QgERZUfeFGoZ","title":"3.4 Gene Ontology (GO) and KEGG databases","slug":"gene-ontology-go-kegg-databases","type":"STUDY_GUIDE","date":null},{"id":"6e7nJqML2dTsueoC","title":"3.5 Genomic data management and storage","slug":"genomic-data-management-storage","type":"STUDY_GUIDE","date":null}]},{"id":"k4FnaWHVJeXh57UJ","name":"Unit 4 – Genome Annotation: Finding Genes","emoji":"📚","slug":"unit-4","hasResources":true,"resources":[{"id":"bq4EVWCZeiN7tcXh","title":"4.2 Ab initio gene prediction","slug":"ab-initio-gene-prediction","type":"STUDY_GUIDE","date":null},{"id":"izkwoON2XJvAX0P6","title":"4.3 Evidence-based gene 
prediction","slug":"evidence-based-gene-prediction","type":"STUDY_GUIDE","date":null},{"id":"bEYXbuvtMNEM796E","title":"4.5 Non-coding RNA annotation","slug":"non-coding-rna-annotation","type":"STUDY_GUIDE","date":null},{"id":"qtTziUsnOKfO5MIB","title":"4.4 Functional annotation of genes and proteins","slug":"functional-annotation-genes-proteins","type":"STUDY_GUIDE","date":null},{"id":"lmvuEdfMPNu2NE3m","title":"4.1 Gene structure and organization","slug":"gene-structure-organization","type":"STUDY_GUIDE","date":null}]},{"id":"fZqZLMyPmymF6hgf","name":"Unit 5 – Comparative Genomics: Evolution Insights","emoji":"📚","slug":"unit-5","hasResources":true,"resources":[{"id":"JqcieXv9ldJnXVoF","title":"5.4 Evolutionary rate estimation","slug":"evolutionary-rate-estimation","type":"STUDY_GUIDE","date":null},{"id":"qwkhEva6AKYprYkF","title":"5.5 Positive and negative selection","slug":"positive-negative-selection","type":"STUDY_GUIDE","date":null},{"id":"Zs9wKuJDfHJxvl2Y","title":"5.2 Phylogenetic analysis","slug":"phylogenetic-analysis","type":"STUDY_GUIDE","date":null},{"id":"PEED8NmGSbgPNZh8","title":"5.3 Genome alignment and synteny","slug":"genome-alignment-synteny","type":"STUDY_GUIDE","date":null},{"id":"jN3NzyeqHurvKbDx","title":"5.1 Orthology and paralogy","slug":"orthology-paralogy","type":"STUDY_GUIDE","date":null}]},{"id":"IKIXDFB2kFhRpWBr","name":"Unit 6 – Regulatory Genomics & Epigenomics","emoji":"📚","slug":"unit-6","hasResources":true,"resources":[{"id":"CrVeWEFf4Jdb9P7F","title":"6.1 Transcription factors and regulatory elements","slug":"transcription-factors-regulatory-elements","type":"STUDY_GUIDE","date":null},{"id":"YzfWiXlI8XGUBijb","title":"6.2 Chromatin structure and histone modifications","slug":"chromatin-structure-histone-modifications","type":"STUDY_GUIDE","date":null},{"id":"TjpuKK3MJT9WJaY2","title":"6.4 Chromatin immunoprecipitation (ChIP) and 
ChIP-seq","slug":"chromatin-immunoprecipitation-chip-chip-seq","type":"STUDY_GUIDE","date":null},{"id":"apljhrv5aSHV6Moq","title":"6.5 Enhancer-promoter interactions","slug":"enhancer-promoter-interactions","type":"STUDY_GUIDE","date":null},{"id":"H8RunDQhXRfQImam","title":"6.3 DNA methylation","slug":"dna-methylation","type":"STUDY_GUIDE","date":null}]},{"id":"ngCP8wlfCNs7BMFL","name":"Unit 7 – Structural Variation & Copy Number Analysis","emoji":"📚","slug":"unit-7","hasResources":true,"resources":[{"id":"e5JIMOVQQkzNL0oM","title":"7.1 Types of structural variations","slug":"types-structural-variations","type":"STUDY_GUIDE","date":null},{"id":"gkFxQKfvS4gWMJ46","title":"7.2 Copy number variations (CNVs)","slug":"copy-number-variations-cnvs","type":"STUDY_GUIDE","date":null},{"id":"MRB2RHP9LzYIr6tf","title":"7.5 Structural variant detection methods","slug":"structural-variant-detection-methods","type":"STUDY_GUIDE","date":null},{"id":"S5X8J6owP4deK1KH","title":"7.4 Inversions and translocations","slug":"inversions-translocations","type":"STUDY_GUIDE","date":null},{"id":"ZyGtaVCCm6mPyMDA","title":"7.3 Insertions and deletions (indels)","slug":"insertions-deletions-indels","type":"STUDY_GUIDE","date":null}]},{"id":"KrICFmBMS0kFsqLb","name":"Unit 8 – Population Genomics and GWAS","emoji":"📚","slug":"unit-8","hasResources":true,"resources":[{"id":"6bVVfn9ALT9xth8e","title":"8.1 Hardy-Weinberg equilibrium","slug":"hardy-weinberg-equilibrium","type":"STUDY_GUIDE","date":null},{"id":"a2UOEyrGtO4ozqdW","title":"8.2 Linkage disequilibrium","slug":"linkage-disequilibrium","type":"STUDY_GUIDE","date":null},{"id":"NCBju8eKM0scpnqn","title":"8.3 Population structure and admixture","slug":"population-structure-admixture","type":"STUDY_GUIDE","date":null},{"id":"Tv1TlSsbiCJ2AUVz","title":"8.4 Genome-wide association studies (GWAS)","slug":"genome-wide-association-studies-gwas","type":"STUDY_GUIDE","date":null},{"id":"VhpmcTUjYX7PqlVY","title":"8.5 Genotype 
imputation","slug":"genotype-imputation","type":"STUDY_GUIDE","date":null}]},{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"EmX26AxpxQtHCk2T","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","type":"STUDY_GUIDE","date":null},{"id":"Sgbz2v78n5w6Jpvf","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","type":"STUDY_GUIDE","date":null},{"id":"XgFJq4aaYf0bUT1s","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","type":"STUDY_GUIDE","date":null},{"id":"YaOPd55WlPDCFUlp","title":"9.3 Differential gene expression","slug":"differential-gene-expression","type":"STUDY_GUIDE","date":null},{"id":"6GSEt5qoBER92OkV","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","type":"STUDY_GUIDE","date":null}]},{"id":"2n1nV5ALPeAKCNnZ","name":"Unit 10 – Metagenomics & Microbiome Analysis","emoji":"📚","slug":"unit-10","hasResources":true,"resources":[{"id":"qV6j2t8Cf7jqK5FF","title":"10.1 Microbial community profiling","slug":"microbial-community-profiling","type":"STUDY_GUIDE","date":null},{"id":"bWBJWQK6rb4vvl4u","title":"10.2 16S rRNA sequencing","slug":"16s-rrna-sequencing","type":"STUDY_GUIDE","date":null},{"id":"AbgVUgkpA1WG2yZN","title":"10.5 Functional analysis of microbial communities","slug":"functional-analysis-microbial-communities","type":"STUDY_GUIDE","date":null},{"id":"ztztgVMcPcc2gahc","title":"10.3 Shotgun metagenomics","slug":"shotgun-metagenomics","type":"STUDY_GUIDE","date":null},{"id":"W1ZiaVupDeBpgxm2","title":"10.4 Metagenome assembly and binning","slug":"metagenome-assembly-binning","type":"STUDY_GUIDE","date":null}]},{"id":"Rkw1WjdHC2dHRhE9","name":"Unit 11 – Genomic Data Visualization & Analysis","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"raivLMuWfqHSK3Sq","title":"11.1 Genome 
browsers","slug":"genome-browsers","type":"STUDY_GUIDE","date":null},{"id":"PAMGQShhrae2Cdjp","title":"11.2 Heatmaps and clustering","slug":"heatmaps-clustering","type":"STUDY_GUIDE","date":null},{"id":"M744RJzgEiCOV0CJ","title":"11.3 Principal component analysis (PCA)","slug":"principal-component-analysis-pca","type":"STUDY_GUIDE","date":null},{"id":"L7ADV4agY8FtRgJH","title":"11.4 Network visualization","slug":"network-visualization","type":"STUDY_GUIDE","date":null},{"id":"sIv6AWYoN55yVmUo","title":"11.5 Data integration and multi-omics analysis","slug":"data-integration-multi-omics-analysis","type":"STUDY_GUIDE","date":null}]},{"id":"nmNe1iFif3JsxiVB","name":"Unit 12 – Ethical & Social Impact of Genomics","emoji":"📚","slug":"unit-12","hasResources":true,"resources":[{"id":"fednbzdoGHz5VT9l","title":"12.1 Informed consent and privacy","slug":"informed-consent-privacy","type":"STUDY_GUIDE","date":null},{"id":"LaQxvQNh1qTc5EI5","title":"12.2 Genetic discrimination","slug":"genetic-discrimination","type":"STUDY_GUIDE","date":null},{"id":"7v2GgofphNghAx0q","title":"12.3 Incidental findings and return of results","slug":"incidental-findings-return-results","type":"STUDY_GUIDE","date":null},{"id":"gVNAmJCqvPAzvEaL","title":"12.4 Ownership and sharing of genomic data","slug":"ownership-sharing-genomic-data","type":"STUDY_GUIDE","date":null},{"id":"wCAhhTjWLtUVlZf0","title":"12.5 Genomics and personalized medicine","slug":"genomics-personalized-medicine","type":"STUDY_GUIDE","date":null}]}],"activeUnit":{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"EmX26AxpxQtHCk2T","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","type":"STUDY_GUIDE","date":null},{"id":"Sgbz2v78n5w6Jpvf","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","type":"STUDY_GUIDE","date":null},{"id":"XgFJq4aaYf0bUT1s","title":"9.4 Alternative splicing 
analysis","slug":"alternative-splicing-analysis","type":"STUDY_GUIDE","date":null},{"id":"YaOPd55WlPDCFUlp","title":"9.3 Differential gene expression","slug":"differential-gene-expression","type":"STUDY_GUIDE","date":null},{"id":"6GSEt5qoBER92OkV","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","type":"STUDY_GUIDE","date":null}]}},"keyTerms":{"keyTerms":"$undefined"},"pageData":{"subject":{"id":"computational-genomics","name":"Computational Genomics","keyTermsActive":null,"generationMetadata":{}},"unit":{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"EmX26AxpxQtHCk2T","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","type":"STUDY_GUIDE","date":null},{"id":"Sgbz2v78n5w6Jpvf","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","type":"STUDY_GUIDE","date":null},{"id":"XgFJq4aaYf0bUT1s","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","type":"STUDY_GUIDE","date":null},{"id":"YaOPd55WlPDCFUlp","title":"9.3 Differential gene expression","slug":"differential-gene-expression","type":"STUDY_GUIDE","date":null},{"id":"6GSEt5qoBER92OkV","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","type":"STUDY_GUIDE","date":null}]},"topic":"$undefined","content":"$undefined","apQuestionData":"$undefined"},"contentQueryData":{}},"initialToc":{"units":[{"id":"h3K4U8fVGJ8bDzAx","name":"Unit 1 – Genome Sequencing Technologies","emoji":"📚","slug":"unit-1","hasResources":true,"resources":[{"id":"Nv6MV94TbxFyrNlK","title":"1.1 Sanger sequencing","slug":"sanger-sequencing","type":"STUDY_GUIDE","date":null},{"id":"vAlKzWD5viW9VtSo","title":"1.3 Third-generation sequencing","slug":"third-generation-sequencing","type":"STUDY_GUIDE","date":null},{"id":"mrQUsoKtDbX08BzQ","title":"1.4 Sequencing platforms and 
instrumentation","slug":"sequencing-platforms-instrumentation","type":"STUDY_GUIDE","date":null},{"id":"0fQd3ElhUGJ1WMyI","title":"1.5 Sequencing strategies (whole-genome, exome, targeted)","slug":"sequencing-strategies-whole-genome-exome-targeted","type":"STUDY_GUIDE","date":null},{"id":"vSg0rNdUUvcyV6JS","title":"1.6 Quality control and preprocessing of sequencing data","slug":"quality-control-preprocessing-sequencing-data","type":"STUDY_GUIDE","date":null},{"id":"71rQ1N2Zmg8s6Kg9","title":"1.2 Next-generation sequencing (NGS)","slug":"next-generation-sequencing-ngs","type":"STUDY_GUIDE","date":null}]},{"id":"qS0mGJeSG4cuSZ0c","name":"Unit 2 – Sequence Alignment & Assembly","emoji":"📚","slug":"unit-2","hasResources":true,"resources":[{"id":"6WbJNa8aceTab2w2","title":"2.4 De novo assembly","slug":"de-novo-assembly","type":"STUDY_GUIDE","date":null},{"id":"Dgzm0Hy6TU9wwrVn","title":"2.5 Reference-guided assembly","slug":"reference-guided-assembly","type":"STUDY_GUIDE","date":null},{"id":"cCUmcOk95RciXvXW","title":"2.3 Sequence assembly algorithms","slug":"sequence-assembly-algorithms","type":"STUDY_GUIDE","date":null},{"id":"CvuRmJmtBugqfnmj","title":"2.6 Genome scaffolding and gap filling","slug":"genome-scaffolding-gap-filling","type":"STUDY_GUIDE","date":null},{"id":"Qaq92dA6SwIf6cDR","title":"2.1 Pairwise sequence alignment","slug":"pairwise-sequence-alignment","type":"STUDY_GUIDE","date":null},{"id":"5SmBFhFCXiN2dfjx","title":"2.2 Multiple sequence alignment","slug":"multiple-sequence-alignment","type":"STUDY_GUIDE","date":null}]},{"id":"7ikRoAY9EqRxMkmB","name":"Unit 3 – Genomic Data: Databases and Formats","emoji":"📚","slug":"unit-3","hasResources":true,"resources":[{"id":"JPea87p3usKdZaS5","title":"3.1 GenBank and EMBL databases","slug":"genbank-embl-databases","type":"STUDY_GUIDE","date":null},{"id":"l2pVmQQZfjPwUcX6","title":"3.3 SAM/BAM and VCF formats","slug":"sambam-vcf-formats","type":"STUDY_GUIDE","date":null},{"id":"hco6BKJeHFzHQjyx","title":"3.2 
FASTA and FASTQ formats","slug":"fasta-fastq-formats","type":"STUDY_GUIDE","date":null},{"id":"2jR4QgERZUfeFGoZ","title":"3.4 Gene Ontology (GO) and KEGG databases","slug":"gene-ontology-go-kegg-databases","type":"STUDY_GUIDE","date":null},{"id":"6e7nJqML2dTsueoC","title":"3.5 Genomic data management and storage","slug":"genomic-data-management-storage","type":"STUDY_GUIDE","date":null}]},{"id":"k4FnaWHVJeXh57UJ","name":"Unit 4 – Genome Annotation: Finding Genes","emoji":"📚","slug":"unit-4","hasResources":true,"resources":[{"id":"bq4EVWCZeiN7tcXh","title":"4.2 Ab initio gene prediction","slug":"ab-initio-gene-prediction","type":"STUDY_GUIDE","date":null},{"id":"izkwoON2XJvAX0P6","title":"4.3 Evidence-based gene prediction","slug":"evidence-based-gene-prediction","type":"STUDY_GUIDE","date":null},{"id":"bEYXbuvtMNEM796E","title":"4.5 Non-coding RNA annotation","slug":"non-coding-rna-annotation","type":"STUDY_GUIDE","date":null},{"id":"qtTziUsnOKfO5MIB","title":"4.4 Functional annotation of genes and proteins","slug":"functional-annotation-genes-proteins","type":"STUDY_GUIDE","date":null},{"id":"lmvuEdfMPNu2NE3m","title":"4.1 Gene structure and organization","slug":"gene-structure-organization","type":"STUDY_GUIDE","date":null}]},{"id":"fZqZLMyPmymF6hgf","name":"Unit 5 – Comparative Genomics: Evolution Insights","emoji":"📚","slug":"unit-5","hasResources":true,"resources":[{"id":"JqcieXv9ldJnXVoF","title":"5.4 Evolutionary rate estimation","slug":"evolutionary-rate-estimation","type":"STUDY_GUIDE","date":null},{"id":"qwkhEva6AKYprYkF","title":"5.5 Positive and negative selection","slug":"positive-negative-selection","type":"STUDY_GUIDE","date":null},{"id":"Zs9wKuJDfHJxvl2Y","title":"5.2 Phylogenetic analysis","slug":"phylogenetic-analysis","type":"STUDY_GUIDE","date":null},{"id":"PEED8NmGSbgPNZh8","title":"5.3 Genome alignment and synteny","slug":"genome-alignment-synteny","type":"STUDY_GUIDE","date":null},{"id":"jN3NzyeqHurvKbDx","title":"5.1 Orthology and 
paralogy","slug":"orthology-paralogy","type":"STUDY_GUIDE","date":null}]},{"id":"IKIXDFB2kFhRpWBr","name":"Unit 6 – Regulatory Genomics & Epigenomics","emoji":"📚","slug":"unit-6","hasResources":true,"resources":[{"id":"CrVeWEFf4Jdb9P7F","title":"6.1 Transcription factors and regulatory elements","slug":"transcription-factors-regulatory-elements","type":"STUDY_GUIDE","date":null},{"id":"YzfWiXlI8XGUBijb","title":"6.2 Chromatin structure and histone modifications","slug":"chromatin-structure-histone-modifications","type":"STUDY_GUIDE","date":null},{"id":"TjpuKK3MJT9WJaY2","title":"6.4 Chromatin immunoprecipitation (ChIP) and ChIP-seq","slug":"chromatin-immunoprecipitation-chip-chip-seq","type":"STUDY_GUIDE","date":null},{"id":"apljhrv5aSHV6Moq","title":"6.5 Enhancer-promoter interactions","slug":"enhancer-promoter-interactions","type":"STUDY_GUIDE","date":null},{"id":"H8RunDQhXRfQImam","title":"6.3 DNA methylation","slug":"dna-methylation","type":"STUDY_GUIDE","date":null}]},{"id":"ngCP8wlfCNs7BMFL","name":"Unit 7 – Structural Variation & Copy Number Analysis","emoji":"📚","slug":"unit-7","hasResources":true,"resources":[{"id":"e5JIMOVQQkzNL0oM","title":"7.1 Types of structural variations","slug":"types-structural-variations","type":"STUDY_GUIDE","date":null},{"id":"gkFxQKfvS4gWMJ46","title":"7.2 Copy number variations (CNVs)","slug":"copy-number-variations-cnvs","type":"STUDY_GUIDE","date":null},{"id":"MRB2RHP9LzYIr6tf","title":"7.5 Structural variant detection methods","slug":"structural-variant-detection-methods","type":"STUDY_GUIDE","date":null},{"id":"S5X8J6owP4deK1KH","title":"7.4 Inversions and translocations","slug":"inversions-translocations","type":"STUDY_GUIDE","date":null},{"id":"ZyGtaVCCm6mPyMDA","title":"7.3 Insertions and deletions (indels)","slug":"insertions-deletions-indels","type":"STUDY_GUIDE","date":null}]},{"id":"KrICFmBMS0kFsqLb","name":"Unit 8 – Population Genomics and 
GWAS","emoji":"📚","slug":"unit-8","hasResources":true,"resources":[{"id":"6bVVfn9ALT9xth8e","title":"8.1 Hardy-Weinberg equilibrium","slug":"hardy-weinberg-equilibrium","type":"STUDY_GUIDE","date":null},{"id":"a2UOEyrGtO4ozqdW","title":"8.2 Linkage disequilibrium","slug":"linkage-disequilibrium","type":"STUDY_GUIDE","date":null},{"id":"NCBju8eKM0scpnqn","title":"8.3 Population structure and admixture","slug":"population-structure-admixture","type":"STUDY_GUIDE","date":null},{"id":"Tv1TlSsbiCJ2AUVz","title":"8.4 Genome-wide association studies (GWAS)","slug":"genome-wide-association-studies-gwas","type":"STUDY_GUIDE","date":null},{"id":"VhpmcTUjYX7PqlVY","title":"8.5 Genotype imputation","slug":"genotype-imputation","type":"STUDY_GUIDE","date":null}]},{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"EmX26AxpxQtHCk2T","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","type":"STUDY_GUIDE","date":null},{"id":"Sgbz2v78n5w6Jpvf","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","type":"STUDY_GUIDE","date":null},{"id":"XgFJq4aaYf0bUT1s","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","type":"STUDY_GUIDE","date":null},{"id":"YaOPd55WlPDCFUlp","title":"9.3 Differential gene expression","slug":"differential-gene-expression","type":"STUDY_GUIDE","date":null},{"id":"6GSEt5qoBER92OkV","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","type":"STUDY_GUIDE","date":null}]},{"id":"2n1nV5ALPeAKCNnZ","name":"Unit 10 – Metagenomics & Microbiome Analysis","emoji":"📚","slug":"unit-10","hasResources":true,"resources":[{"id":"qV6j2t8Cf7jqK5FF","title":"10.1 Microbial community profiling","slug":"microbial-community-profiling","type":"STUDY_GUIDE","date":null},{"id":"bWBJWQK6rb4vvl4u","title":"10.2 16S rRNA 
sequencing","slug":"16s-rrna-sequencing","type":"STUDY_GUIDE","date":null},{"id":"AbgVUgkpA1WG2yZN","title":"10.5 Functional analysis of microbial communities","slug":"functional-analysis-microbial-communities","type":"STUDY_GUIDE","date":null},{"id":"ztztgVMcPcc2gahc","title":"10.3 Shotgun metagenomics","slug":"shotgun-metagenomics","type":"STUDY_GUIDE","date":null},{"id":"W1ZiaVupDeBpgxm2","title":"10.4 Metagenome assembly and binning","slug":"metagenome-assembly-binning","type":"STUDY_GUIDE","date":null}]},{"id":"Rkw1WjdHC2dHRhE9","name":"Unit 11 – Genomic Data Visualization & Analysis","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"raivLMuWfqHSK3Sq","title":"11.1 Genome browsers","slug":"genome-browsers","type":"STUDY_GUIDE","date":null},{"id":"PAMGQShhrae2Cdjp","title":"11.2 Heatmaps and clustering","slug":"heatmaps-clustering","type":"STUDY_GUIDE","date":null},{"id":"M744RJzgEiCOV0CJ","title":"11.3 Principal component analysis (PCA)","slug":"principal-component-analysis-pca","type":"STUDY_GUIDE","date":null},{"id":"L7ADV4agY8FtRgJH","title":"11.4 Network visualization","slug":"network-visualization","type":"STUDY_GUIDE","date":null},{"id":"sIv6AWYoN55yVmUo","title":"11.5 Data integration and multi-omics analysis","slug":"data-integration-multi-omics-analysis","type":"STUDY_GUIDE","date":null}]},{"id":"nmNe1iFif3JsxiVB","name":"Unit 12 – Ethical & Social Impact of Genomics","emoji":"📚","slug":"unit-12","hasResources":true,"resources":[{"id":"fednbzdoGHz5VT9l","title":"12.1 Informed consent and privacy","slug":"informed-consent-privacy","type":"STUDY_GUIDE","date":null},{"id":"LaQxvQNh1qTc5EI5","title":"12.2 Genetic discrimination","slug":"genetic-discrimination","type":"STUDY_GUIDE","date":null},{"id":"7v2GgofphNghAx0q","title":"12.3 Incidental findings and return of results","slug":"incidental-findings-return-results","type":"STUDY_GUIDE","date":null},{"id":"gVNAmJCqvPAzvEaL","title":"12.4 Ownership and sharing of genomic 
data","slug":"ownership-sharing-genomic-data","type":"STUDY_GUIDE","date":null},{"id":"wCAhhTjWLtUVlZf0","title":"12.5 Genomics and personalized medicine","slug":"genomics-personalized-medicine","type":"STUDY_GUIDE","date":null}]}],"activeUnit":{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"EmX26AxpxQtHCk2T","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","type":"STUDY_GUIDE","date":null},{"id":"Sgbz2v78n5w6Jpvf","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","type":"STUDY_GUIDE","date":null},{"id":"XgFJq4aaYf0bUT1s","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","type":"STUDY_GUIDE","date":null},{"id":"YaOPd55WlPDCFUlp","title":"9.3 Differential gene expression","slug":"differential-gene-expression","type":"STUDY_GUIDE","date":null},{"id":"6GSEt5qoBER92OkV","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","type":"STUDY_GUIDE","date":null}]},"activeSubject":{"id":"computational-genomics","name":"Computational Genomics","emoji":"🧬","slug":"computational-genomics","active":true,"keyTermsActive":null,"category":"Math & Computer Science","hasCalculators":false,"hasKeyTerms":true,"hasPracticeQuestions":false,"units":[{"id":"h3K4U8fVGJ8bDzAx","name":"Unit 1 – Genome Sequencing Technologies","emoji":"📚","slug":"unit-1","hasResources":true,"resources":[{"id":"Nv6MV94TbxFyrNlK","title":"1.1 Sanger sequencing","slug":"sanger-sequencing","type":"STUDY_GUIDE","date":null},{"id":"vAlKzWD5viW9VtSo","title":"1.3 Third-generation sequencing","slug":"third-generation-sequencing","type":"STUDY_GUIDE","date":null},{"id":"mrQUsoKtDbX08BzQ","title":"1.4 Sequencing platforms and instrumentation","slug":"sequencing-platforms-instrumentation","type":"STUDY_GUIDE","date":null},{"id":"0fQd3ElhUGJ1WMyI","title":"1.5 Sequencing strategies (whole-genome, exome, 
targeted)","slug":"sequencing-strategies-whole-genome-exome-targeted","type":"STUDY_GUIDE","date":null},{"id":"vSg0rNdUUvcyV6JS","title":"1.6 Quality control and preprocessing of sequencing data","slug":"quality-control-preprocessing-sequencing-data","type":"STUDY_GUIDE","date":null},{"id":"71rQ1N2Zmg8s6Kg9","title":"1.2 Next-generation sequencing (NGS)","slug":"next-generation-sequencing-ngs","type":"STUDY_GUIDE","date":null}]},{"id":"qS0mGJeSG4cuSZ0c","name":"Unit 2 – Sequence Alignment & Assembly","emoji":"📚","slug":"unit-2","hasResources":true,"resources":[{"id":"6WbJNa8aceTab2w2","title":"2.4 De novo assembly","slug":"de-novo-assembly","type":"STUDY_GUIDE","date":null},{"id":"Dgzm0Hy6TU9wwrVn","title":"2.5 Reference-guided assembly","slug":"reference-guided-assembly","type":"STUDY_GUIDE","date":null},{"id":"cCUmcOk95RciXvXW","title":"2.3 Sequence assembly algorithms","slug":"sequence-assembly-algorithms","type":"STUDY_GUIDE","date":null},{"id":"CvuRmJmtBugqfnmj","title":"2.6 Genome scaffolding and gap filling","slug":"genome-scaffolding-gap-filling","type":"STUDY_GUIDE","date":null},{"id":"Qaq92dA6SwIf6cDR","title":"2.1 Pairwise sequence alignment","slug":"pairwise-sequence-alignment","type":"STUDY_GUIDE","date":null},{"id":"5SmBFhFCXiN2dfjx","title":"2.2 Multiple sequence alignment","slug":"multiple-sequence-alignment","type":"STUDY_GUIDE","date":null}]},{"id":"7ikRoAY9EqRxMkmB","name":"Unit 3 – Genomic Data: Databases and Formats","emoji":"📚","slug":"unit-3","hasResources":true,"resources":[{"id":"JPea87p3usKdZaS5","title":"3.1 GenBank and EMBL databases","slug":"genbank-embl-databases","type":"STUDY_GUIDE","date":null},{"id":"l2pVmQQZfjPwUcX6","title":"3.3 SAM/BAM and VCF formats","slug":"sambam-vcf-formats","type":"STUDY_GUIDE","date":null},{"id":"hco6BKJeHFzHQjyx","title":"3.2 FASTA and FASTQ formats","slug":"fasta-fastq-formats","type":"STUDY_GUIDE","date":null},{"id":"2jR4QgERZUfeFGoZ","title":"3.4 Gene Ontology (GO) and KEGG 
databases","slug":"gene-ontology-go-kegg-databases","type":"STUDY_GUIDE","date":null},{"id":"6e7nJqML2dTsueoC","title":"3.5 Genomic data management and storage","slug":"genomic-data-management-storage","type":"STUDY_GUIDE","date":null}]},{"id":"k4FnaWHVJeXh57UJ","name":"Unit 4 – Genome Annotation: Finding Genes","emoji":"📚","slug":"unit-4","hasResources":true,"resources":[{"id":"bq4EVWCZeiN7tcXh","title":"4.2 Ab initio gene prediction","slug":"ab-initio-gene-prediction","type":"STUDY_GUIDE","date":null},{"id":"izkwoON2XJvAX0P6","title":"4.3 Evidence-based gene prediction","slug":"evidence-based-gene-prediction","type":"STUDY_GUIDE","date":null},{"id":"bEYXbuvtMNEM796E","title":"4.5 Non-coding RNA annotation","slug":"non-coding-rna-annotation","type":"STUDY_GUIDE","date":null},{"id":"qtTziUsnOKfO5MIB","title":"4.4 Functional annotation of genes and proteins","slug":"functional-annotation-genes-proteins","type":"STUDY_GUIDE","date":null},{"id":"lmvuEdfMPNu2NE3m","title":"4.1 Gene structure and organization","slug":"gene-structure-organization","type":"STUDY_GUIDE","date":null}]},{"id":"fZqZLMyPmymF6hgf","name":"Unit 5 – Comparative Genomics: Evolution Insights","emoji":"📚","slug":"unit-5","hasResources":true,"resources":[{"id":"JqcieXv9ldJnXVoF","title":"5.4 Evolutionary rate estimation","slug":"evolutionary-rate-estimation","type":"STUDY_GUIDE","date":null},{"id":"qwkhEva6AKYprYkF","title":"5.5 Positive and negative selection","slug":"positive-negative-selection","type":"STUDY_GUIDE","date":null},{"id":"Zs9wKuJDfHJxvl2Y","title":"5.2 Phylogenetic analysis","slug":"phylogenetic-analysis","type":"STUDY_GUIDE","date":null},{"id":"PEED8NmGSbgPNZh8","title":"5.3 Genome alignment and synteny","slug":"genome-alignment-synteny","type":"STUDY_GUIDE","date":null},{"id":"jN3NzyeqHurvKbDx","title":"5.1 Orthology and paralogy","slug":"orthology-paralogy","type":"STUDY_GUIDE","date":null}]},{"id":"IKIXDFB2kFhRpWBr","name":"Unit 6 – Regulatory Genomics & 
Epigenomics","emoji":"📚","slug":"unit-6","hasResources":true,"resources":[{"id":"CrVeWEFf4Jdb9P7F","title":"6.1 Transcription factors and regulatory elements","slug":"transcription-factors-regulatory-elements","type":"STUDY_GUIDE","date":null},{"id":"YzfWiXlI8XGUBijb","title":"6.2 Chromatin structure and histone modifications","slug":"chromatin-structure-histone-modifications","type":"STUDY_GUIDE","date":null},{"id":"TjpuKK3MJT9WJaY2","title":"6.4 Chromatin immunoprecipitation (ChIP) and ChIP-seq","slug":"chromatin-immunoprecipitation-chip-chip-seq","type":"STUDY_GUIDE","date":null},{"id":"apljhrv5aSHV6Moq","title":"6.5 Enhancer-promoter interactions","slug":"enhancer-promoter-interactions","type":"STUDY_GUIDE","date":null},{"id":"H8RunDQhXRfQImam","title":"6.3 DNA methylation","slug":"dna-methylation","type":"STUDY_GUIDE","date":null}]},{"id":"ngCP8wlfCNs7BMFL","name":"Unit 7 – Structural Variation & Copy Number Analysis","emoji":"📚","slug":"unit-7","hasResources":true,"resources":[{"id":"e5JIMOVQQkzNL0oM","title":"7.1 Types of structural variations","slug":"types-structural-variations","type":"STUDY_GUIDE","date":null},{"id":"gkFxQKfvS4gWMJ46","title":"7.2 Copy number variations (CNVs)","slug":"copy-number-variations-cnvs","type":"STUDY_GUIDE","date":null},{"id":"MRB2RHP9LzYIr6tf","title":"7.5 Structural variant detection methods","slug":"structural-variant-detection-methods","type":"STUDY_GUIDE","date":null},{"id":"S5X8J6owP4deK1KH","title":"7.4 Inversions and translocations","slug":"inversions-translocations","type":"STUDY_GUIDE","date":null},{"id":"ZyGtaVCCm6mPyMDA","title":"7.3 Insertions and deletions (indels)","slug":"insertions-deletions-indels","type":"STUDY_GUIDE","date":null}]},{"id":"KrICFmBMS0kFsqLb","name":"Unit 8 – Population Genomics and GWAS","emoji":"📚","slug":"unit-8","hasResources":true,"resources":[{"id":"6bVVfn9ALT9xth8e","title":"8.1 Hardy-Weinberg 
equilibrium","slug":"hardy-weinberg-equilibrium","type":"STUDY_GUIDE","date":null},{"id":"a2UOEyrGtO4ozqdW","title":"8.2 Linkage disequilibrium","slug":"linkage-disequilibrium","type":"STUDY_GUIDE","date":null},{"id":"NCBju8eKM0scpnqn","title":"8.3 Population structure and admixture","slug":"population-structure-admixture","type":"STUDY_GUIDE","date":null},{"id":"Tv1TlSsbiCJ2AUVz","title":"8.4 Genome-wide association studies (GWAS)","slug":"genome-wide-association-studies-gwas","type":"STUDY_GUIDE","date":null},{"id":"VhpmcTUjYX7PqlVY","title":"8.5 Genotype imputation","slug":"genotype-imputation","type":"STUDY_GUIDE","date":null}]},{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"EmX26AxpxQtHCk2T","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","type":"STUDY_GUIDE","date":null},{"id":"Sgbz2v78n5w6Jpvf","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","type":"STUDY_GUIDE","date":null},{"id":"XgFJq4aaYf0bUT1s","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","type":"STUDY_GUIDE","date":null},{"id":"YaOPd55WlPDCFUlp","title":"9.3 Differential gene expression","slug":"differential-gene-expression","type":"STUDY_GUIDE","date":null},{"id":"6GSEt5qoBER92OkV","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","type":"STUDY_GUIDE","date":null}]},{"id":"2n1nV5ALPeAKCNnZ","name":"Unit 10 – Metagenomics & Microbiome Analysis","emoji":"📚","slug":"unit-10","hasResources":true,"resources":[{"id":"qV6j2t8Cf7jqK5FF","title":"10.1 Microbial community profiling","slug":"microbial-community-profiling","type":"STUDY_GUIDE","date":null},{"id":"bWBJWQK6rb4vvl4u","title":"10.2 16S rRNA sequencing","slug":"16s-rrna-sequencing","type":"STUDY_GUIDE","date":null},{"id":"AbgVUgkpA1WG2yZN","title":"10.5 Functional analysis of microbial 
communities","slug":"functional-analysis-microbial-communities","type":"STUDY_GUIDE","date":null},{"id":"ztztgVMcPcc2gahc","title":"10.3 Shotgun metagenomics","slug":"shotgun-metagenomics","type":"STUDY_GUIDE","date":null},{"id":"W1ZiaVupDeBpgxm2","title":"10.4 Metagenome assembly and binning","slug":"metagenome-assembly-binning","type":"STUDY_GUIDE","date":null}]},{"id":"Rkw1WjdHC2dHRhE9","name":"Unit 11 – Genomic Data Visualization & Analysis","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"raivLMuWfqHSK3Sq","title":"11.1 Genome browsers","slug":"genome-browsers","type":"STUDY_GUIDE","date":null},{"id":"PAMGQShhrae2Cdjp","title":"11.2 Heatmaps and clustering","slug":"heatmaps-clustering","type":"STUDY_GUIDE","date":null},{"id":"M744RJzgEiCOV0CJ","title":"11.3 Principal component analysis (PCA)","slug":"principal-component-analysis-pca","type":"STUDY_GUIDE","date":null},{"id":"L7ADV4agY8FtRgJH","title":"11.4 Network visualization","slug":"network-visualization","type":"STUDY_GUIDE","date":null},{"id":"sIv6AWYoN55yVmUo","title":"11.5 Data integration and multi-omics analysis","slug":"data-integration-multi-omics-analysis","type":"STUDY_GUIDE","date":null}]},{"id":"nmNe1iFif3JsxiVB","name":"Unit 12 – Ethical & Social Impact of Genomics","emoji":"📚","slug":"unit-12","hasResources":true,"resources":[{"id":"fednbzdoGHz5VT9l","title":"12.1 Informed consent and privacy","slug":"informed-consent-privacy","type":"STUDY_GUIDE","date":null},{"id":"LaQxvQNh1qTc5EI5","title":"12.2 Genetic discrimination","slug":"genetic-discrimination","type":"STUDY_GUIDE","date":null},{"id":"7v2GgofphNghAx0q","title":"12.3 Incidental findings and return of results","slug":"incidental-findings-return-results","type":"STUDY_GUIDE","date":null},{"id":"gVNAmJCqvPAzvEaL","title":"12.4 Ownership and sharing of genomic data","slug":"ownership-sharing-genomic-data","type":"STUDY_GUIDE","date":null},{"id":"wCAhhTjWLtUVlZf0","title":"12.5 Genomics and personalized 
medicine","slug":"genomics-personalized-medicine","type":"STUDY_GUIDE","date":null}]}]}},"subjectBySlug":{"id":"computational-genomics","name":"Computational Genomics","branch":"Engineering","keyTermsActive":null,"subBranches":[{"name":"Biomedical Engineering"}],"description":"## What do you learn in Computational Genomics\n\nComputational Genomics covers the analysis of genetic data using computer algorithms. You'll learn about DNA sequencing, genome assembly, comparative genomics, and gene expression analysis. The course dives into machine learning techniques for predicting gene function and explores population genetics. You'll also get hands-on experience with bioinformatics tools and programming languages like Python or R for genomic data manipulation.\n\n## Is Computational Genomics hard?\n\nIt can be pretty challenging, not gonna lie. The mix of biology and computer science concepts can be a lot to wrap your head around. The programming part can be tough if you're not already comfortable with coding. But don't freak out - if you're into puzzles and problem-solving, you might actually find it pretty cool. Just be ready to put in some serious study time.\n\n## Tips for taking Computational Genomics in college\n\n1. Use [Fiveable Study Guides](https://fiveable.me/cram-mode) to help you cram 🌶️\n2. Practice coding regularly - don't just rely on in-class exercises\n3. Form study groups to tackle complex algorithms together\n4. Visualize genomic data using tools like IGV or Circos\n5. Stay updated with current research papers in genomics\n6. Try out online bioinformatics platforms like Galaxy or UCSC Genome Browser\n7. Watch \"GATTACA\" for a sci-fi take on genetic engineering\n8. Read \"The Genome War\" by James Shreeve for historical context\n\n## Common pre-requisites for Computational Genomics\n\n1. Introduction to Programming: Learn the basics of coding, usually in Python or Java. You'll cover fundamental concepts like variables, loops, and functions.\n\n2. 
Molecular Biology: Dive into the structure and function of DNA, RNA, and proteins. This class gives you the biological foundation needed for genomics.\n\n3. Statistics for Bioinformatics: Get comfortable with statistical methods used in analyzing biological data. You'll learn about probability distributions, hypothesis testing, and data visualization techniques.\n\n## Classes similar to Computational Genomics\n\n1. Bioinformatics Algorithms: Focuses on the computational methods used to analyze biological data. You'll learn about sequence alignment, phylogenetic tree construction, and protein structure prediction.\n\n2. Machine Learning for Genomics: Explores how AI and machine learning techniques can be applied to genomic data. Covers topics like clustering, classification, and deep learning for biological sequence analysis.\n\n3. Systems Biology: Looks at biological systems as a whole, integrating genomics with other -omics data. You'll learn about network analysis, metabolic modeling, and gene regulatory networks.\n\n4. Functional Genomics: Dives into methods for determining gene function on a genome-wide scale. Covers techniques like RNA-seq, ChIP-seq, and CRISPR screening.\n\n## Majors related to Computational Genomics\n\n1. Bioinformatics: Combines biology, computer science, and statistics to analyze biological data. Students learn to develop algorithms and tools for processing genomic and proteomic information.\n\n2. Computational Biology: Focuses on using mathematical and computational approaches to understand biological systems. Students study modeling biological processes and analyzing large-scale biological data.\n\n3. Bioengineering: Applies engineering principles to biological and medical systems. Students learn to design and develop new technologies for healthcare, including genomic analysis tools.\n\n4. Data Science: Concentrates on extracting insights from large datasets. 
Students learn statistical methods and machine learning techniques that can be applied to genomic data analysis.\n\n## What can you do with a degree in Computational Genomics?\n\n1. Bioinformatics Scientist: Develops algorithms and software tools for analyzing genomic data. They often work in research institutions or biotech companies, helping to interpret large-scale biological datasets.\n\n2. Genomic Data Analyst: Processes and interprets genomic sequencing data for various applications. They might work in clinical settings, helping to identify genetic variants associated with diseases.\n\n3. Personalized Medicine Researcher: Uses genomic data to develop tailored medical treatments. They work on identifying genetic markers that can predict drug responses or disease risks.\n\n4. Computational Biologist: Models biological systems using computer simulations. They might work on projects like predicting protein structures or simulating the spread of infectious diseases.\n\n## Computational Genomics FAQs\n\n1. Do I need to be good at both biology and computer science? It helps, but you can usually catch up in one area if you're stronger in the other. The key is being willing to learn and put in the effort.\n\n2. What programming languages are most useful? Python and R are the big ones, but some courses might use Java or C++. It's more about understanding programming concepts than mastering a specific language.\n\n3. Can I use these skills outside of academia? Absolutely! Biotech companies, pharmaceutical firms, and even some tech giants are looking for people with computational genomics skills.\n\n4. How quickly does the field change? Pretty fast! New sequencing technologies and analysis methods come out all the time. 
You'll need to keep learning even after you finish the course.","emoji":"🧬","order":null,"numResources":null,"active":true,"slug":"computational-genomics","generationMetadata":{"group":"Group 9 – parent key terms first","level":"college undergrad","branch":"Engineering","duration":"one semester","subBranch":null,"lengthVariant":"less text","model":"opus"}},"pageParams":{"communitySlug":"computational-genomics","unitSlug":"unit-9"},"children":["$","$L1c",null,{"subject":{"name":"Computational Genomics","emoji":"🧬","slug":"computational-genomics","category":"Math & Computer Science","active":true,"keyTermsActive":null,"generationMetadata":{"group":"Group 9 – parent key terms first","level":"college undergrad","branch":"Engineering","duration":"one semester","subBranch":null,"lengthVariant":"less text","model":"opus"},"id":"computational-genomics","order":null,"numResources":null,"description":"## What do you learn in Computational Genomics\n\nComputational Genomics covers the analysis of genetic data using computer algorithms. You'll learn about DNA sequencing, genome assembly, comparative genomics, and gene expression analysis. The course dives into machine learning techniques for predicting gene function and explores population genetics. You'll also get hands-on experience with bioinformatics tools and programming languages like Python or R for genomic data manipulation.\n\n## Is Computational Genomics hard?\n\nIt can be pretty challenging, not gonna lie. The mix of biology and computer science concepts can be a lot to wrap your head around. The programming part can be tough if you're not already comfortable with coding. But don't freak out - if you're into puzzles and problem-solving, you might actually find it pretty cool. Just be ready to put in some serious study time.\n\n## Tips for taking Computational Genomics in college\n\n1. Use [Fiveable Study Guides](https://fiveable.me/cram-mode) to help you cram 🌶️\n2. 
Practice coding regularly - don't just rely on in-class exercises\n3. Form study groups to tackle complex algorithms together\n4. Visualize genomic data using tools like IGV or Circos\n5. Stay updated with current research papers in genomics\n6. Try out online bioinformatics platforms like Galaxy or UCSC Genome Browser\n7. Watch \"GATTACA\" for a sci-fi take on genetic engineering\n8. Read \"The Genome War\" by James Shreeve for historical context\n\n## Common pre-requisites for Computational Genomics\n\n1. Introduction to Programming: Learn the basics of coding, usually in Python or Java. You'll cover fundamental concepts like variables, loops, and functions.\n\n2. Molecular Biology: Dive into the structure and function of DNA, RNA, and proteins. This class gives you the biological foundation needed for genomics.\n\n3. Statistics for Bioinformatics: Get comfortable with statistical methods used in analyzing biological data. You'll learn about probability distributions, hypothesis testing, and data visualization techniques.\n\n## Classes similar to Computational Genomics\n\n1. Bioinformatics Algorithms: Focuses on the computational methods used to analyze biological data. You'll learn about sequence alignment, phylogenetic tree construction, and protein structure prediction.\n\n2. Machine Learning for Genomics: Explores how AI and machine learning techniques can be applied to genomic data. Covers topics like clustering, classification, and deep learning for biological sequence analysis.\n\n3. Systems Biology: Looks at biological systems as a whole, integrating genomics with other -omics data. You'll learn about network analysis, metabolic modeling, and gene regulatory networks.\n\n4. Functional Genomics: Dives into methods for determining gene function on a genome-wide scale. Covers techniques like RNA-seq, ChIP-seq, and CRISPR screening.\n\n## Majors related to Computational Genomics\n\n1. 
Bioinformatics: Combines biology, computer science, and statistics to analyze biological data. Students learn to develop algorithms and tools for processing genomic and proteomic information.\n\n2. Computational Biology: Focuses on using mathematical and computational approaches to understand biological systems. Students study modeling biological processes and analyzing large-scale biological data.\n\n3. Bioengineering: Applies engineering principles to biological and medical systems. Students learn to design and develop new technologies for healthcare, including genomic analysis tools.\n\n4. Data Science: Concentrates on extracting insights from large datasets. Students learn statistical methods and machine learning techniques that can be applied to genomic data analysis.\n\n## What can you do with a degree in Computational Genomics?\n\n1. Bioinformatics Scientist: Develops algorithms and software tools for analyzing genomic data. They often work in research institutions or biotech companies, helping to interpret large-scale biological datasets.\n\n2. Genomic Data Analyst: Processes and interprets genomic sequencing data for various applications. They might work in clinical settings, helping to identify genetic variants associated with diseases.\n\n3. Personalized Medicine Researcher: Uses genomic data to develop tailored medical treatments. They work on identifying genetic markers that can predict drug responses or disease risks.\n\n4. Computational Biologist: Models biological systems using computer simulations. They might work on projects like predicting protein structures or simulating the spread of infectious diseases.\n\n## Computational Genomics FAQs\n\n1. Do I need to be good at both biology and computer science? It helps, but you can usually catch up in one area if you're stronger in the other. The key is being willing to learn and put in the effort.\n\n2. What programming languages are most useful? 
Python and R are the big ones, but some courses might use Java or C++. It's more about understanding programming concepts than mastering a specific language.\n\n3. Can I use these skills outside of academia? Absolutely! Biotech companies, pharmaceutical firms, and even some tech giants are looking for people with computational genomics skills.\n\n4. How quickly does the field change? Pretty fast! New sequencing technologies and analysis methods come out all the time. You'll need to keep learning even after you finish the course.","meta":{"title":"Computational Genomics – Notes and Study Guides","description":"Study guides with what you need to know for your class on Computational Genomics. Ace your next test."},"units":[{"id":"h3K4U8fVGJ8bDzAx","name":"Unit 1 – Genome Sequencing Technologies","emoji":"📚","slug":"unit-1","description":"Unit 1: Genome Sequencing Technologies","intro":"Genome sequencing technologies have revolutionized our understanding of genetics and biology. From Sanger sequencing to next-generation methods, these tools allow scientists to decode DNA with increasing speed and accuracy. They've enabled breakthroughs in disease research, personalized medicine, and evolutionary studies.\n\nAs sequencing becomes faster and cheaper, it's transforming fields like medicine and agriculture. However, challenges remain in data analysis, interpretation, and ethics. 
Emerging technologies like long-read and single-cell sequencing promise to further expand our genomic knowledge and applications.","overview":"## Key Concepts and Terminology\n- Genome the complete set of genetic material present in an organism\n- DNA sequencing the process of determining the precise order of nucleotides within a DNA molecule\n- Sanger sequencing a method of DNA sequencing based on the selective incorporation of chain-terminating dideoxynucleotides by DNA polymerase during in vitro DNA replication\n- Next-generation sequencing (NGS) a term used to describe several modern high-throughput sequencing technologies that enable the sequencing of large numbers of DNA molecules in parallel\n - Includes technologies such as Illumina sequencing, Ion Torrent sequencing, and Pacific Biosciences sequencing\n- Reads the short DNA sequences produced by a sequencing instrument, typically ranging from 50 to 400 base pairs in length\n- Coverage the average number of reads that align to, or \"cover,\" each base in the reference genome\n- Assembly the process of aligning and merging sequencing reads to reconstruct the original DNA sequence\n- Variant calling the process of identifying differences between the sequenced genome and a reference genome, such as single nucleotide polymorphisms (SNPs) and insertions/deletions (indels)\n\n## Historical Context and Evolution\n- DNA structure first described by James Watson and Francis Crick in 1953, based on X-ray crystallography data collected by Rosalind Franklin\n- Sanger sequencing developed by Frederick Sanger in 1977, which became the primary method for DNA sequencing for several decades\n - Sanger sequencing relies on the use of labeled chain-terminating dideoxynucleotides (ddNTPs) to generate DNA fragments of varying lengths\n - These fragments are then separated by size using gel electrophoresis, allowing the DNA sequence to be read\n- Automation and refinement of Sanger sequencing led to the completion of the Human 
Genome Project in 2003, which produced the first complete sequence of the human genome\n- Development of next-generation sequencing (NGS) technologies in the mid-2000s revolutionized the field by enabling high-throughput, parallel sequencing of DNA molecules\n- Continuous improvements in NGS technologies have led to increased sequencing speed, accuracy, and affordability, making large-scale genomic studies more feasible\n\n## DNA Sequencing Methods\n- Sanger sequencing the traditional method of DNA sequencing that relies on the use of labeled chain-terminating dideoxynucleotides (ddNTPs) to generate DNA fragments of varying lengths\n - DNA sample is divided into four separate sequencing reactions, each containing a different ddNTP (ddATP, ddCTP, ddGTP, or ddTTP)\n - The ddNTPs are incorporated by DNA polymerase during in vitro DNA replication, causing the termination of DNA strand elongation\n - The resulting DNA fragments are then separated by size using gel electrophoresis, allowing the DNA sequence to be read\n- Maxam-Gilbert sequencing an early DNA sequencing method that relies on chemical modification and cleavage of DNA\n - DNA sample is radiolabeled at one end and then cleaved at specific bases using chemical treatments\n - The resulting DNA fragments are separated by size using gel electrophoresis, allowing the DNA sequence to be read\n- Pyrosequencing a sequencing method that relies on the detection of pyrophosphate release during DNA synthesis\n - DNA synthesis is performed in a stepwise manner, with each nucleotide added sequentially\n - The release of pyrophosphate during nucleotide incorporation is detected using a luminescent enzyme, allowing the DNA sequence to be determined in real-time\n- Chain termination methods a class of DNA sequencing methods that rely on the use of labeled chain-terminating nucleotides to generate DNA fragments of varying lengths (includes Sanger sequencing)\n\n## Next-Generation Sequencing Technologies\n- Illumina sequencing 
a widely used NGS platform that relies on the use of fluorescently labeled reversible terminator nucleotides\n - DNA sample is fragmented and adapters are ligated to the ends of the fragments\n - The fragments are then amplified by PCR and attached to a solid surface (flow cell)\n - Sequencing is performed by the sequential addition of fluorescently labeled nucleotides, with each cycle of nucleotide addition followed by imaging to determine the incorporated base\n- Ion Torrent sequencing an NGS platform that relies on the detection of hydrogen ions released during DNA synthesis\n - DNA fragments are attached to a semiconductor chip and sequencing is performed by the sequential addition of unlabeled nucleotides\n - The incorporation of a nucleotide causes the release of a hydrogen ion, which is detected by a change in pH on the semiconductor chip\n- Pacific Biosciences sequencing an NGS platform that relies on the real-time observation of DNA synthesis by a single DNA polymerase molecule\n - DNA synthesis is performed using fluorescently labeled nucleotides within a zero-mode waveguide (ZMW)\n - The incorporation of each nucleotide causes a fluorescent signal that is detected in real-time, allowing the DNA sequence to be determined\n- Oxford Nanopore sequencing an NGS platform that relies on the detection of changes in electrical current as DNA molecules pass through a protein nanopore\n - DNA sample is mixed with a protein nanopore and an ionic current is passed through the nanopore\n - As DNA molecules pass through the nanopore, they cause changes in the electrical current that are characteristic of the DNA sequence\n\n## Bioinformatics Tools for Sequence Analysis\n- Quality control tools software programs used to assess the quality of sequencing data and remove low-quality reads or bases (FastQC, Trimmomatic)\n- Alignment tools software programs used to align sequencing reads to a reference genome or to each other (BWA, Bowtie, HISAT)\n - Alignment is necessary 
to determine the location of each read within the genome and to identify differences between the sequenced genome and the reference genome\n- Variant calling tools software programs used to identify differences between the sequenced genome and a reference genome, such as single nucleotide polymorphisms (SNPs) and insertions/deletions (indels) (GATK, SAMtools, FreeBayes)\n- Genome assembly tools software programs used to align and merge sequencing reads to reconstruct the original DNA sequence (SPAdes, Velvet, SOAPdenovo)\n - Assembly is necessary when a reference genome is not available or when the goal is to identify novel sequences or structural variations\n- Annotation tools software programs used to identify and assign biological meaning to functional elements within a genome, such as genes, regulatory regions, and non-coding RNAs (MAKER, Augustus, Prokka)\n\n## Applications in Research and Medicine\n- Disease gene discovery sequencing can be used to identify genetic variants associated with inherited disorders or complex diseases (Alzheimer's disease, cancer)\n- Personalized medicine sequencing can be used to guide treatment decisions based on an individual's genetic profile (pharmacogenomics, cancer treatment)\n- Microbial genomics sequencing can be used to study the genomes of bacteria, viruses, and other microorganisms (pathogen identification, antibiotic resistance)\n - This can aid in the development of new antibiotics, vaccines, and diagnostic tests\n- Agricultural genomics sequencing can be used to study the genomes of crops and livestock to improve traits such as yield, disease resistance, and nutritional content\n- Evolutionary studies sequencing can be used to study the evolutionary relationships between species and to identify regions of the genome that have undergone selection\n\n## Challenges and Limitations\n- High cost sequencing technologies can be expensive, particularly for large-scale studies or clinical applications\n- Data storage and 
management the large amounts of data generated by sequencing require significant computational resources for storage and analysis\n - This can be a challenge for smaller research groups or institutions with limited resources\n- Interpretation of variants determining the biological significance of genetic variants can be difficult, particularly for rare or novel variants\n - This requires the integration of multiple lines of evidence, including functional studies and population-level data\n- Ethical considerations sequencing can raise ethical concerns related to privacy, informed consent, and the potential for genetic discrimination\n - There are also concerns about the use of sequencing data for non-medical purposes, such as forensic investigations or ancestry testing\n- Technical limitations current sequencing technologies have limitations in terms of read length, accuracy, and the ability to sequence certain regions of the genome (repetitive regions, structural variations)\n\n## Future Trends and Emerging Technologies\n- Long-read sequencing technologies that generate reads of several kilobases or even megabases in length, allowing for improved genome assembly and the identification of structural variations (Pacific Biosciences, Oxford Nanopore)\n- Single-cell sequencing technologies that allow for the sequencing of individual cells, enabling the study of cellular heterogeneity and rare cell types\n- Spatial transcriptomics technologies that allow for the spatial mapping of gene expression within tissues, providing insights into the relationship between cellular function and spatial organization\n- Epigenomic sequencing technologies that allow for the mapping of epigenetic modifications, such as DNA methylation and histone modifications, which play important roles in gene regulation and development\n- Integration of sequencing with other omics technologies, such as proteomics and metabolomics, to provide a more comprehensive view of biological systems\n- 
Continued development of bioinformatics tools and databases to facilitate the analysis and interpretation of sequencing data\n- Increased use of sequencing in clinical settings for diagnosis, prognosis, and treatment selection, particularly in the areas of cancer and rare genetic disorders","active":true,"order":1,"meta":{"title":"Genome Sequencing Technologies | Computational Genomics Class Notes","description":"Study guides to review Genome Sequencing Technologies. For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"Nv6MV94TbxFyrNlK","type":"STUDY_GUIDE","title":"1.1 Sanger sequencing","slug":"sanger-sequencing","date":null,"keyTopics":[],"publicId":"Nv6MV94TbxFyrNlK","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["tlyBt6nfmnKdLG9b"],"duration":10},{"id":"vAlKzWD5viW9VtSo","type":"STUDY_GUIDE","title":"1.3 Third-generation sequencing","slug":"third-generation-sequencing","date":null,"keyTopics":[],"publicId":"vAlKzWD5viW9VtSo","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["23keZENvsCWbA9R6"],"duration":8},{"id":"mrQUsoKtDbX08BzQ","type":"STUDY_GUIDE","title":"1.4 Sequencing platforms and instrumentation","slug":"sequencing-platforms-instrumentation","date":null,"keyTopics":[],"publicId":"mrQUsoKtDbX08BzQ","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["XHNUUfMoYVjneVCC"],"duration":10},{"id":"0fQd3ElhUGJ1WMyI","type":"STUDY_GUIDE","title":"1.5 Sequencing strategies (whole-genome, exome, 
targeted)","slug":"sequencing-strategies-whole-genome-exome-targeted","date":null,"keyTopics":[],"publicId":"0fQd3ElhUGJ1WMyI","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["EWxIiKcxixiUB3B3"],"duration":8},{"id":"vSg0rNdUUvcyV6JS","type":"STUDY_GUIDE","title":"1.6 Quality control and preprocessing of sequencing data","slug":"quality-control-preprocessing-sequencing-data","date":null,"keyTopics":[],"publicId":"vSg0rNdUUvcyV6JS","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["BJF2OkY8K4AFvLQZ"],"duration":14},{"id":"71rQ1N2Zmg8s6Kg9","type":"STUDY_GUIDE","title":"1.2 Next-generation sequencing (NGS)","slug":"next-generation-sequencing-ngs","date":null,"keyTopics":[],"publicId":"71rQ1N2Zmg8s6Kg9","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["hpob7mIKOtsxmhhr"],"duration":12}],"numResources":1},{"id":"qS0mGJeSG4cuSZ0c","name":"Unit 2 – Sequence Alignment & Assembly","emoji":"📚","slug":"unit-2","description":"Unit 2: Sequence Alignment and Assembly","intro":"Sequence alignment and assembly are fundamental techniques in computational genomics. They allow researchers to compare DNA, RNA, and protein sequences, identifying similarities that reveal functional and evolutionary relationships. These methods are crucial for understanding genetic variation and reconstructing genomes from fragmented data.\n\nFrom pairwise alignments to complex genome assemblies, these techniques employ various algorithms and tools. They enable applications like comparative genomics, variant detection, and personalized medicine. 
Understanding these methods is essential for interpreting genomic data and advancing our knowledge of biological systems.","overview":"## Key Concepts\n- Sequence alignment involves arranging DNA, RNA, or protein sequences to identify regions of similarity that may indicate functional, structural, or evolutionary relationships between the sequences\n- Pairwise alignment compares two sequences at a time while multiple sequence alignment compares more than two sequences simultaneously\n- Dynamic programming algorithms (Needleman-Wunsch and Smith-Waterman) guarantee optimal pairwise alignments but have high computational complexity\n- Heuristic algorithms (BLAST and FASTA) trade some accuracy for improved speed and scalability\n- Genome assembly reconstructs the original DNA sequence from numerous smaller sequenced fragments called reads\n- De novo assembly builds contigs and scaffolds without using a reference genome while reference-guided assembly aligns reads to a known reference sequence\n- Sequence alignment and genome assembly play crucial roles in various applications such as phylogenetic analysis, variant detection, and comparative genomics\n\n## Biological Background\n- DNA consists of four nucleotide bases: adenine (A), thymine (T), guanine (G), and cytosine (C)\n - Complementary base pairing occurs between A-T and G-C\n- RNA contains uracil (U) instead of thymine and plays vital roles in gene expression and regulation\n- Proteins are composed of amino acids and perform a wide range of functions in living organisms\n - The genetic code determines the relationship between nucleotide triplets (codons) and amino acids\n- Mutations can alter DNA sequences through substitutions, insertions, or deletions\n - Point mutations affect single nucleotides while structural variations involve larger segments of DNA\n- Evolutionary processes such as selection, drift, and recombination shape the diversity of DNA sequences across species\n- Conserved regions in DNA or 
protein sequences often indicate functional or structural importance\n\n## Sequence Alignment Basics\n- Sequence alignment arranges sequences to maximize the number of matching characters and minimize the number of gaps (insertions or deletions)\n- Matches, mismatches, and gaps are assigned scores based on their likelihood of occurrence\n - Scoring matrices (PAM and BLOSUM) provide empirically derived substitution scores for amino acids\n- Global alignment aligns entire sequences from end to end (Needleman-Wunsch algorithm)\n- Local alignment identifies the most similar regions between sequences without requiring end-to-end alignment (Smith-Waterman algorithm)\n- Gaps are introduced to account for insertions or deletions and are typically penalized in alignment scoring\n- Alignment quality is assessed using metrics such as percent identity, similarity, and gap content\n- Sequence alignment enables the identification of homologous sequences, which share a common evolutionary origin\n\n## Pairwise Alignment Algorithms\n- Dynamic programming algorithms guarantee optimal pairwise alignments by systematically exploring all possible alignments\n - Needleman-Wunsch algorithm performs global alignment and uses a scoring matrix and gap penalties to fill a dynamic programming matrix\n - Smith-Waterman algorithm performs local alignment and allows for the identification of the most similar subsequences\n- Heuristic algorithms provide faster alternatives to dynamic programming by sacrificing some accuracy\n - BLAST (Basic Local Alignment Search Tool) uses a seed-and-extend approach to identify high-scoring segment pairs (HSPs) between a query sequence and a database\n - FASTA (FAST-All) employs a k-tuple method to find initial matches and then extends them using a dynamic programming algorithm\n- Alignment parameters such as substitution matrices, gap penalties, and significance thresholds can be adjusted to optimize alignment results\n- Pairwise alignment serves as a 
foundation for multiple sequence alignment and homology searching\n\n## Multiple Sequence Alignment\n- Multiple sequence alignment (MSA) simultaneously aligns three or more sequences to identify conserved regions and evolutionary relationships\n- Progressive alignment algorithms (ClustalW and MUSCLE) build an MSA by progressively aligning the most similar sequences based on a guide tree\n - Guide trees are constructed using pairwise alignment scores or phylogenetic methods\n- Iterative refinement algorithms (MAFFT and MUSCLE) improve the initial MSA by repeatedly dividing and realigning subsets of sequences\n- Consistency-based methods (T-Coffee, ProbCons, and CONTRAlign) incorporate pairwise alignment information from all sequences to guide the MSA construction\n- MSA quality assessment tools (GUIDANCE and TCS) evaluate the reliability of alignments and identify potentially misaligned regions\n- Multiple sequence alignment is essential for phylogenetic analysis, protein structure prediction, and functional annotation\n\n## Genome Assembly Techniques\n- Genome assembly reconstructs the original DNA sequence from numerous smaller sequenced fragments called reads\n- Sanger sequencing produces longer reads (800-1000 bp) with higher accuracy but lower throughput compared to next-generation sequencing (NGS) technologies\n - NGS platforms (Illumina, 454, and SOLiD) generate millions of shorter reads (50-400 bp) with varying error rates and throughput\n- Overlap-layout-consensus (OLC) assembly algorithms (Celera Assembler and Arachne) identify overlaps between reads, construct a graph representation, and generate a consensus sequence\n- De Bruijn graph assemblers (Velvet and SPAdes) break reads into k-mers, build a graph based on k-mer overlaps, and traverse the graph to assemble contigs\n- Hybrid assembly approaches (MaSuRCA and Allpaths-LG) combine the strengths of different sequencing technologies and assembly algorithms\n- Scaffolding techniques (SSPACE and BESST) order and 
orient contigs using paired-end reads or long-range information (optical mapping and Hi-C)\n- Assembly quality metrics include N50, number of contigs, and completeness of conserved gene sets (BUSCO)\n\n## Tools and Software\n- Sequence alignment tools:\n - BLAST: widely used for local alignment and homology searching against sequence databases\n - MUSCLE: fast and accurate multiple sequence alignment program\n - T-Coffee: consistency-based MSA tool that combines information from pairwise alignments\n - MAFFT: rapid MSA algorithm with options for large-scale alignments and iterative refinement\n- Genome assembly software:\n - SPAdes: de Bruijn graph assembler for both single-cell and multi-cell sequencing data\n - Canu: long-read assembler for PacBio and Oxford Nanopore sequencing data\n - Allpaths-LG: hybrid assembler that uses both short and long reads to generate high-quality assemblies\n - QUAST: quality assessment tool for evaluating genome assemblies\n- Visualization and analysis platforms:\n - Integrative Genomics Viewer (IGV): interactive visualization tool for exploring sequence alignments and genome annotations\n - Galaxy: web-based platform for accessible, reproducible, and transparent genomic analyses\n - Bioconductor: open-source software project in R for analyzing high-throughput genomic data\n\n## Applications and Case Studies\n- Comparative genomics: sequence alignment enables the identification of conserved regions, regulatory elements, and evolutionary relationships between species\n - Example: comparing the genomes of humans and chimpanzees to study the genetic basis of human-specific traits\n- Variant detection: aligning sequencing reads to a reference genome allows for the identification of single nucleotide polymorphisms (SNPs), insertions, and deletions\n - Example: whole-exome sequencing to identify disease-causing mutations in patients with rare genetic disorders\n- Metagenomics: assembling and analyzing DNA sequences from environmental 
samples to study microbial communities and their functions\n - Example: investigating the role of the human gut microbiome in health and disease\n- Evolutionary studies: multiple sequence alignment and phylogenetic analysis help reconstruct the evolutionary history of genes, proteins, and species\n - Example: tracing the origin and spread of SARS-CoV-2 using genome sequences from different viral isolates\n- Personalized medicine: identifying genetic variations associated with disease risk, drug response, and treatment outcomes\n - Example: using genome sequencing to guide targeted cancer therapy based on a patient's tumor profile","active":true,"order":2,"meta":{"title":"Sequence Alignment & Assembly | Computational Genomics Class Notes","description":"Study guides to review Sequence Alignment & Assembly. For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"6WbJNa8aceTab2w2","type":"STUDY_GUIDE","title":"2.4 De novo assembly","slug":"de-novo-assembly","date":null,"keyTopics":[],"publicId":"6WbJNa8aceTab2w2","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["aSST5SaJA64k5fHm"],"duration":6},{"id":"Dgzm0Hy6TU9wwrVn","type":"STUDY_GUIDE","title":"2.5 Reference-guided assembly","slug":"reference-guided-assembly","date":null,"keyTopics":[],"publicId":"Dgzm0Hy6TU9wwrVn","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["wP8WIBgHbSryzhtE"],"duration":7},{"id":"cCUmcOk95RciXvXW","type":"STUDY_GUIDE","title":"2.3 Sequence assembly 
algorithms","slug":"sequence-assembly-algorithms","date":null,"keyTopics":[],"publicId":"cCUmcOk95RciXvXW","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["wk3zI248dtvovY7C"],"duration":9},{"id":"CvuRmJmtBugqfnmj","type":"STUDY_GUIDE","title":"2.6 Genome scaffolding and gap filling","slug":"genome-scaffolding-gap-filling","date":null,"keyTopics":[],"publicId":"CvuRmJmtBugqfnmj","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["IWtNZTe5OaW1GIt6"],"duration":12},{"id":"Qaq92dA6SwIf6cDR","type":"STUDY_GUIDE","title":"2.1 Pairwise sequence alignment","slug":"pairwise-sequence-alignment","date":null,"keyTopics":[],"publicId":"Qaq92dA6SwIf6cDR","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["7HfsSMxPQ7M5Pkcv"],"duration":4},{"id":"5SmBFhFCXiN2dfjx","type":"STUDY_GUIDE","title":"2.2 Multiple sequence alignment","slug":"multiple-sequence-alignment","date":null,"keyTopics":[],"publicId":"5SmBFhFCXiN2dfjx","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["RBJxkbHN6nOjRjaR"],"duration":7}],"numResources":1},{"id":"7ikRoAY9EqRxMkmB","name":"Unit 3 – Genomic Data: Databases and Formats","emoji":"📚","slug":"unit-3","description":"Unit 3: Genomic Databases and Data Formats","intro":"Genomic data is the foundation of modern genetics, encompassing DNA sequences, genes, and regulatory elements. This unit explores the types of genomic data, major databases, and file formats used to store and analyze genetic information.\n\nUnderstanding genomic data is crucial for bioinformatics, which applies computational methods to interpret biological data. 
This unit covers key concepts like sequence alignment, assembly, and annotation, as well as tools for data manipulation and practical applications in medicine and agriculture.","overview":"## Key Concepts and Terminology\n- Genomic data encompasses the complete set of genetic information for an organism, including DNA sequences, genes, and regulatory elements\n- Bioinformatics involves the application of computational methods to analyze and interpret biological data, particularly genomic data\n- Sequence alignment is the process of comparing and aligning multiple DNA, RNA, or protein sequences to identify similarities and differences\n - Pairwise alignment compares two sequences at a time\n - Multiple sequence alignment simultaneously aligns three or more sequences\n- Sequence assembly refers to the process of reconstructing a complete DNA sequence from shorter, overlapping fragments generated by sequencing technologies\n- Annotation is the process of identifying and labeling functional elements within a genome, such as genes, regulatory regions, and non-coding RNAs\n- Ontologies are standardized vocabularies that provide a common language for describing biological concepts and relationships (Gene Ontology)\n- Data compression techniques are used to reduce the storage space required for genomic data while maintaining data integrity (gzip, bzip2)\n- Data mining involves the application of computational methods to discover patterns, associations, and insights from large genomic datasets\n\n## Types of Genomic Data\n- DNA sequencing data consists of the order of nucleotides (A, C, G, T) in a genome or specific region of interest\n - Whole genome sequencing data covers the entire genome of an organism\n - Targeted sequencing data focuses on specific regions of interest (exome sequencing)\n- RNA sequencing (RNA-seq) data provides information on the transcriptome, which is the complete set of RNA molecules expressed in a cell or tissue at a given time\n- Epigenomic data 
includes information on chemical modifications to DNA and histone proteins that regulate gene expression without altering the DNA sequence (DNA methylation, histone modifications)\n- Genotyping data captures genetic variations, such as single nucleotide polymorphisms (SNPs) and copy number variations (CNVs), across individuals or populations\n- Proteomic data encompasses information on the structure, function, and interactions of proteins expressed in a cell or organism\n- Metabolomic data represents the complete set of small molecules (metabolites) present in a biological sample, providing insights into cellular metabolism\n- Phenotypic data includes observable characteristics or traits of an organism, such as morphology, behavior, and disease state, which can be associated with genomic data\n\n## Major Genomic Databases\n- GenBank is a comprehensive public database maintained by the National Center for Biotechnology Information (NCBI) that stores annotated nucleotide sequences for various organisms\n- Ensembl is a joint project between the European Bioinformatics Institute (EBI) and the Wellcome Trust Sanger Institute that provides access to annotated genomes for a wide range of species\n- The European Nucleotide Archive (ENA) is a public database maintained by the EBI that stores nucleotide sequencing data and associated metadata\n- The DNA Data Bank of Japan (DDBJ) is a public database that collects, annotates, and distributes nucleotide sequence data, focusing on data from Asia and Oceania\n- The Sequence Read Archive (SRA) is a repository for raw sequencing data generated by high-throughput sequencing platforms, maintained by the NCBI, EBI, and DDBJ\n- The Genomic Data Commons (GDC) is a data repository and analysis platform developed by the National Cancer Institute (NCI) for storing and sharing cancer genomic data\n- The Gene Expression Omnibus (GEO) is a public database hosted by the NCBI that archives and distributes microarray and RNA-seq gene expression 
data\n- The Protein Data Bank (PDB) is a repository for 3D structural data of proteins, nucleic acids, and complex assemblies, maintained by the worldwide PDB (wwPDB) organization\n\n## Common File Formats\n- FASTA is a text-based format for representing nucleotide or amino acid sequences, where each sequence is preceded by a header line starting with \">\"\n- FASTQ is a text-based format for storing both biological sequences and their corresponding quality scores, commonly used for raw sequencing data\n- BAM (Binary Alignment Map) is a binary format for storing aligned sequencing reads, which is compressed and indexed for efficient storage and retrieval\n - SAM (Sequence Alignment Map) is the human-readable, text-based version of the BAM format\n- VCF (Variant Call Format) is a text-based format for storing genetic variation data, such as SNPs and indels, along with annotations and genotype information\n- GFF (General Feature Format) and GTF (Gene Transfer Format) are tab-delimited text formats for describing genomic features and their locations, commonly used for genome annotation\n- BED (Browser Extensible Data) is a tab-delimited text format for defining genomic regions, often used for representing genomic features or regions of interest\n- MPEG-G is a compressed file format specifically designed for efficient storage and transport of genomic data, leveraging video compression techniques\n- HDF5 (Hierarchical Data Format version 5) is a file format designed for storing large and complex datasets, including genomic data, in a hierarchical structure\n\n## Data Storage and Retrieval Methods\n- Relational databases, such as MySQL and PostgreSQL, can be used to store and manage structured genomic data, enabling efficient querying and data retrieval\n - SQL (Structured Query Language) is used to interact with relational databases\n- NoSQL databases, such as MongoDB and Cassandra, offer flexible and scalable solutions for storing and retrieving large-scale genomic 
data\n - These databases often use key-value, document, or column-family data models\n- Distributed file systems, like Hadoop Distributed File System (HDFS) and Lustre, enable storage and processing of massive genomic datasets across multiple computers\n- Cloud storage platforms, such as Amazon S3, Google Cloud Storage, and Microsoft Azure Blob Storage, provide scalable and cost-effective solutions for storing and accessing genomic data\n- Indexing techniques, such as hash tables and B-trees, are used to create efficient data structures that enable fast searching and retrieval of specific genomic regions or features\n- Compression algorithms, like gzip and bzip2, are employed to reduce the storage footprint of genomic data while maintaining data integrity\n- Parallel and distributed computing frameworks, such as Apache Spark and Dask, enable efficient processing and analysis of large genomic datasets across multiple nodes or clusters\n- Data lakes and data warehouses are centralized repositories that store structured and unstructured genomic data from various sources, facilitating data integration and analysis\n\n## Tools for Data Manipulation\n- Samtools is a suite of command-line utilities for manipulating and analyzing SAM/BAM files, including sorting, merging, and indexing alignments\n- Bedtools is a powerful toolset for genome arithmetic, allowing users to intersect, merge, and manipulate genomic intervals in BED, GFF, and VCF formats\n- GATK (Genome Analysis Toolkit) is a widely used framework developed by the Broad Institute for analyzing high-throughput sequencing data, with a focus on variant discovery and genotyping\n- Picard is a set of Java command-line tools for manipulating and processing sequencing data in SAM/BAM and VCF formats, including tasks like marking duplicates and calculating alignment metrics\n- Plink is a versatile command-line tool for analyzing genotype and phenotype data, with capabilities for data filtering, quality control, and 
association testing\n- VCFtools is a collection of Perl and C++ utilities for working with VCF files, enabling tasks such as filtering, merging, and calculating population genetic statistics\n- Hail is an open-source library for scalable genomic data analysis, built on top of Apache Spark, with support for data import, quality control, and statistical analysis\n- Bioconductor is an open-source software project in R that provides a wide range of packages for the analysis and comprehension of genomic data, including tools for data import, normalization, and visualization\n\n## Challenges and Limitations\n- Data storage and management become increasingly challenging as the volume of genomic data continues to grow exponentially, requiring scalable and cost-effective solutions\n- Data privacy and security are critical concerns when dealing with sensitive genomic information, necessitating strict access controls and secure data storage and transfer protocols\n- Data integration from multiple sources and formats can be complex and time-consuming, requiring standardized data models and interoperable tools\n- Data quality and consistency can vary across different sequencing platforms, experimental conditions, and analysis pipelines, necessitating robust quality control and data harmonization methods\n- Computational resources, such as processing power and memory, can be limiting factors when analyzing large-scale genomic datasets, requiring efficient algorithms and distributed computing approaches\n- Interpretation of genomic data is complex and often requires domain expertise, as the biological significance of genetic variations and their impact on phenotypes may not be immediately apparent\n- Reproducibility of genomic analyses can be challenging due to the rapid evolution of tools, databases, and reference genomes, emphasizing the need for detailed documentation and version control\n- Ethical considerations surrounding the use and sharing of genomic data must be 
addressed, including issues of informed consent, data ownership, and potential misuse of genetic information\n\n## Practical Applications\n- Precision medicine leverages genomic data to tailor medical treatments and interventions based on an individual's genetic profile, enabling more targeted and effective therapies\n- Genetic disease diagnosis and carrier screening can be improved by analyzing genomic data to identify disease-causing mutations and assess the risk of passing genetic disorders to offspring\n- Drug discovery and development can be accelerated by using genomic data to identify novel drug targets, predict drug responses, and stratify patient populations for clinical trials\n- Agricultural genomics applies genomic technologies to improve crop yields, enhance disease resistance, and develop more sustainable farming practices\n - Marker-assisted selection uses genetic markers to select plants or animals with desirable traits\n- Evolutionary and comparative genomics studies use genomic data to trace the evolutionary history of species, identify conserved genetic elements, and understand the mechanisms of adaptation and speciation\n- Microbiome research employs genomic sequencing to characterize the diverse microbial communities living in and on organisms, with applications in human health, environmental science, and biotechnology\n- Forensic genomics utilizes DNA evidence to identify individuals, establish familial relationships, and solve crimes, relying on genomic databases and advanced sequencing technologies\n- Population genomics studies the genetic variation within and between populations, providing insights into population structure, migration patterns, and the genetic basis of complex traits and diseases","active":true,"order":3,"meta":{"title":"Genomic Data: Databases and Formats | Computational Genomics Class Notes","description":"Study guides to review Genomic Data: Databases and Formats. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"JPea87p3usKdZaS5","type":"STUDY_GUIDE","title":"3.1 GenBank and EMBL databases","slug":"genbank-embl-databases","date":null,"keyTopics":[],"publicId":"JPea87p3usKdZaS5","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Xubm4TK0tGDzy0Dy"],"duration":9},{"id":"l2pVmQQZfjPwUcX6","type":"STUDY_GUIDE","title":"3.3 SAM/BAM and VCF formats","slug":"sambam-vcf-formats","date":null,"keyTopics":[],"publicId":"l2pVmQQZfjPwUcX6","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["vTFHufkpgqI4lJiR"],"duration":10},{"id":"hco6BKJeHFzHQjyx","type":"STUDY_GUIDE","title":"3.2 FASTA and FASTQ formats","slug":"fasta-fastq-formats","date":null,"keyTopics":[],"publicId":"hco6BKJeHFzHQjyx","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["1am1wea6yoYPNURU"],"duration":9},{"id":"2jR4QgERZUfeFGoZ","type":"STUDY_GUIDE","title":"3.4 Gene Ontology (GO) and KEGG databases","slug":"gene-ontology-go-kegg-databases","date":null,"keyTopics":[],"publicId":"2jR4QgERZUfeFGoZ","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["S8RNrCDW4YxZsfb6"],"duration":10},{"id":"6e7nJqML2dTsueoC","type":"STUDY_GUIDE","title":"3.5 Genomic data management and storage","slug":"genomic-data-management-storage","date":null,"keyTopics":[],"publicId":"6e7nJqML2dTsueoC","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["n8FXNgCKC29144EM"],"duration":9}],"numResources":1},{"id":"k4FnaWHVJeXh57UJ","name":"Unit 4 – Genome Annotation: Finding 
Genes","emoji":"📚","slug":"unit-4","description":"Unit 4: Genome Annotation and Gene Prediction","intro":"Genome annotation is the process of identifying and labeling functional elements in genomic sequences. It involves locating protein-coding genes, non-coding RNAs, and regulatory regions, providing a roadmap for understanding an organism's genome structure and function.\n\nAnnotation bridges the gap between raw sequence data and biological understanding. It enables researchers to explore the genetic basis of traits and diseases, supports comparative genomics, and guides experimental design. This process relies on computational algorithms, experimental data, and expert curation.","overview":"## What's Genome Annotation?\n- Genome annotation involves identifying and labeling functional elements within genomic sequences\n- Includes locating protein-coding genes, non-coding RNAs, regulatory regions, and repetitive elements\n - Protein-coding genes are regions that encode for proteins essential for cellular functions\n - Non-coding RNAs (ncRNAs) do not encode proteins but have regulatory roles (miRNAs, lncRNAs)\n- Assigns biological information to genomic features based on evidence from experiments and computational predictions\n- Provides a roadmap for understanding the structure and function of an organism's genome\n- Enables researchers to explore the genetic basis of traits, diseases, and evolutionary relationships\n- Relies on a combination of experimental data, computational algorithms, and manual curation by experts\n- Continuously updated as new evidence and improved methods become available\n\n## Why We Need It\n- Raw genomic sequences alone provide limited biological insights without functional context\n- Genome annotation bridges the gap between sequence data and biological understanding\n- Enables the identification of genes and their products, which are the fundamental units of heredity and function\n- Facilitates comparative genomics by identifying 
conserved and species-specific elements across organisms\n- Supports the discovery of disease-associated genes and variants for medical research and diagnostics\n- Guides the design of experiments to study gene function, regulation, and interactions\n- Enhances the interpretation of high-throughput genomic data (RNA-seq, ChIP-seq) by providing a reference framework\n- Enables the development of targeted therapies, genetic engineering, and synthetic biology applications\n\n## Key Concepts and Terms\n- Genes: Segments of DNA that encode functional products (proteins or RNAs)\n- Exons: Coding regions within a gene that are retained in the mature mRNA after splicing\n- Introns: Non-coding regions within a gene that are spliced out during mRNA processing\n- Promoters: Regulatory regions upstream of genes that control transcription initiation\n- Transcription start site (TSS): The position where RNA synthesis begins in a gene\n- Untranslated regions (UTRs): Non-coding regions at the 5' and 3' ends of mRNA that regulate stability and translation\n- Open reading frame (ORF): A continuous stretch of codons that can potentially encode a protein\n- Codon: A triplet of nucleotides that specifies an amino acid or stop signal during translation\n- Splice sites: Sequences at exon-intron boundaries that guide the splicing machinery\n- Consensus sequence: Conserved nucleotide patterns associated with functional elements (splice sites, promoters)\n\n## Gene Finding Methods\n- Ab initio prediction: Uses intrinsic sequence features (codon usage, splice signals) to predict genes without relying on external evidence\n - Examples: GENSCAN, GeneID, AUGUSTUS\n- Homology-based approaches: Identify genes based on sequence similarity to known genes in other organisms\n - Relies on sequence alignment tools (BLAST, BLAT) to find conserved regions\n - Useful for annotating genes with conserved functions across species\n- Evidence-based methods: Incorporate experimental data (ESTs, RNA-seq, 
protein sequences) to guide gene predictions\n - Expressed sequence tags (ESTs) provide evidence of transcribed regions\n - RNA-seq data helps define exon-intron boundaries and alternative splicing events\n- Comparative genomics: Leverages conservation patterns across multiple species to identify functional elements\n - Assumes that functionally important regions are under selective pressure and more conserved\n- Combiners: Integrate predictions from multiple methods to generate consensus gene models\n - Examples: JIGSAW, EvidenceModeler, MAKER\n- Manual curation: Involves expert review and refinement of gene models based on additional evidence and biological knowledge\n\n## Tools and Software\n- BLAST (Basic Local Alignment Search Tool): Widely used for homology-based gene identification\n - Compares query sequences against databases of known genes and proteins\n- Ensembl: A comprehensive genome annotation system that integrates various evidence sources\n - Provides a web-based interface for accessing and visualizing annotated genomes\n- NCBI Genome Workbench: An integrated platform for analyzing and annotating genomic sequences\n - Offers tools for ab initio gene prediction, homology search, and evidence-based annotation\n- MAKER: A portable and configurable genome annotation pipeline\n - Combines ab initio gene predictors, homology-based methods, and experimental evidence\n- Apollo: A collaborative, web-based genome annotation editor\n - Allows manual curation and refinement of gene models by multiple users\n- InterProScan: A tool for identifying protein domains and functional motifs\n - Helps assign putative functions to predicted proteins based on conserved patterns\n- JBrowse: A fast and interactive genome browser for visualizing annotations and experimental data\n - Enables users to explore and navigate annotated genomes in a web-based interface\n\n## Challenges and Limitations\n- Incomplete or fragmented genome assemblies can hinder accurate gene 
identification\n- Pseudogenes and retroposed gene copies can be mistaken for functional genes\n- Short or rapidly evolving genes may be missed by homology-based methods\n- Alternative splicing and isoforms can complicate the definition of gene boundaries\n- Non-coding RNAs and regulatory elements are harder to predict than protein-coding genes\n- Insufficient experimental evidence can lead to incorrect or incomplete annotations\n- Annotation quality varies across species and genomic regions\n- Keeping annotations up-to-date with new evidence and changing knowledge is an ongoing challenge\n- Computational predictions require validation through experimental studies\n\n## Practical Applications\n- Identifying disease-associated genes and variants for diagnosis and targeted therapies\n - Example: Annotating cancer genomes to find driver mutations and potential drug targets\n- Designing targeted knockout or knockdown experiments to study gene function\n - Relies on accurate gene models to guide the selection of target regions\n- Developing genetically modified organisms for agriculture and biotechnology\n - Requires knowledge of gene structure and regulatory elements for precise modifications\n- Investigating the evolution of gene families and species-specific adaptations\n - Comparative genomics relies on consistent annotations across species\n- Guiding the interpretation of transcriptomic and proteomic data\n - Mapping RNA-seq reads and peptides to annotated genes helps quantify expression and identify novel isoforms\n- Enabling the discovery of novel biomarkers and therapeutic targets\n - Well-annotated genomes facilitate the identification of differentially expressed or mutated genes in disease states\n\n## Future Directions\n- Integrating multi-omics data (epigenomics, proteomics, metabolomics) for more comprehensive annotations\n- Developing machine learning approaches to improve the accuracy and efficiency of gene prediction\n- Expanding annotations to include 
tissue-specific and condition-specific gene expression patterns\n- Characterizing the functions of non-coding RNAs and their regulatory networks\n- Improving the annotation of complex genomic regions (centromeres, telomeres, repetitive elements)\n- Establishing community-driven standards and guidelines for genome annotation\n- Developing user-friendly tools and platforms for accessing and exploring annotated genomes\n- Incorporating single-cell sequencing data to capture cell type-specific gene expression and regulation","active":true,"order":4,"meta":{"title":"Genome Annotation: Finding Genes | Computational Genomics Class Notes","description":"Study guides to review Genome Annotation: Finding Genes. For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"bq4EVWCZeiN7tcXh","type":"STUDY_GUIDE","title":"4.2 Ab initio gene prediction","slug":"ab-initio-gene-prediction","date":null,"keyTopics":[],"publicId":"bq4EVWCZeiN7tcXh","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["YqVYpOhgkKQcP1iW"],"duration":6},{"id":"izkwoON2XJvAX0P6","type":"STUDY_GUIDE","title":"4.3 Evidence-based gene prediction","slug":"evidence-based-gene-prediction","date":null,"keyTopics":[],"publicId":"izkwoON2XJvAX0P6","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["UrcT4zoEyZ6USSIM"],"duration":9},{"id":"bEYXbuvtMNEM796E","type":"STUDY_GUIDE","title":"4.5 Non-coding RNA annotation","slug":"non-coding-rna-annotation","date":null,"keyTopics":[],"publicId":"bEYXbuvtMNEM796E","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["i4CSi5VSiu5XT2VG"],"duration":7},{"id":"qtTziUsnOKfO5MIB","type":"STUDY_GUIDE","title":"4.4 Functional annotation of genes and 
proteins","slug":"functional-annotation-genes-proteins","date":null,"keyTopics":[],"publicId":"qtTziUsnOKfO5MIB","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["rqmhL73Msg7pf6gm"],"duration":12},{"id":"lmvuEdfMPNu2NE3m","type":"STUDY_GUIDE","title":"4.1 Gene structure and organization","slug":"gene-structure-organization","date":null,"keyTopics":[],"publicId":"lmvuEdfMPNu2NE3m","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["ZZgcioypP7N1FEIo"],"duration":10}],"numResources":1},{"id":"fZqZLMyPmymF6hgf","name":"Unit 5 – Comparative Genomics: Evolution Insights","emoji":"📚","slug":"unit-5","description":"Unit 5: Comparative Genomics and Evolutionary Analysis","intro":"Comparative genomics analyzes genomic sequences from different species to understand evolutionary relationships. This field explores concepts like homology, sequence alignment, and phylogenetic trees to uncover genetic changes over time. It combines evolutionary theory with modern sequencing technologies.\n\nKey applications include identifying genes linked to traits or diseases, studying antibiotic resistance, and informing conservation efforts. 
Future challenges involve handling increasing data volumes, integrating multi-omics approaches, and addressing ethical concerns in genomic research.","overview":"## Key Concepts and Definitions\n- Comparative genomics involves analyzing and comparing genomic sequences from different species to gain insights into evolutionary relationships and processes\n- Homologous sequences are similar due to common ancestry and can be classified as orthologs (sequences diverged by speciation) or paralogs (sequences diverged by duplication)\n- Sequence alignment is the process of arranging sequences to identify regions of similarity that may indicate functional, structural, or evolutionary relationships\n - Global alignment attempts to align entire sequences, while local alignment focuses on specific regions of high similarity\n- Phylogenetic trees represent the evolutionary relationships among organisms, with branches indicating speciation events and nodes representing common ancestors\n- Evolutionary distance measures the amount of genetic change that has occurred between sequences, often expressed as the number of nucleotide or amino acid substitutions per site\n- Positive selection occurs when beneficial mutations are favored and fixed in a population, leading to adaptive changes in the genome\n- Purifying selection removes deleterious mutations from a population, conserving functionally important regions of the genome\n\n## Evolutionary Theory Foundations\n- Darwin's theory of evolution by natural selection posits that organisms with advantageous traits are more likely to survive and reproduce, passing these traits to their offspring\n- Genetic drift is the random fluctuation of allele frequencies in a population, which can lead to the fixation or loss of alleles independently of their adaptive value\n- Hardy-Weinberg equilibrium describes a population in which allele and genotype frequencies remain constant across generations, assuming no evolutionary forces are acting upon 
it\n - Deviations from Hardy-Weinberg equilibrium can indicate the presence of evolutionary processes such as selection, mutation, or migration\n- Molecular clock hypothesis suggests that the rate of molecular evolution is relatively constant over time, allowing the estimation of divergence times between species\n- Neutral theory of molecular evolution proposes that most genetic changes at the molecular level are neutral and do not affect an organism's fitness\n - Under neutral theory, the rate of molecular evolution is determined primarily by the mutation rate rather than selection\n- Coalescent theory is a population genetic framework that traces the ancestry of alleles back in time to their most recent common ancestor, providing insights into population history and demography\n\n## Genomic Data Sources and Types\n- Whole genome sequencing provides the complete DNA sequence of an organism's genome, enabling comprehensive comparative analyses\n- Transcriptome sequencing (RNA-seq) captures the complete set of RNA transcripts in a cell or tissue, allowing the study of gene expression and regulation across species\n- Targeted sequencing focuses on specific regions of the genome, such as exomes or candidate genes, reducing sequencing costs and data complexity\n- Mitochondrial DNA (mtDNA) is often used in comparative genomics due to its high mutation rate, maternal inheritance, and lack of recombination\n- Bacterial and archaeal genomes are typically smaller and have higher gene density compared to eukaryotic genomes, making them valuable for studying prokaryotic evolution and diversity\n- Ancient DNA extracted from fossils or historical specimens can provide insights into the evolutionary history of extinct species and population dynamics over time\n - However, ancient DNA is often degraded and contaminated, requiring specialized techniques for sequencing and analysis\n\n## Sequence Alignment Techniques\n- Pairwise alignment compares two sequences to identify 
similarities and differences, using algorithms such as Needleman-Wunsch (global) or Smith-Waterman (local)\n- Multiple sequence alignment (MSA) simultaneously aligns three or more sequences, allowing the identification of conserved regions and evolutionary patterns across species\n - Progressive alignment methods (CLUSTAL, T-Coffee) build an MSA by iteratively aligning the most similar sequences and adding more divergent sequences to the growing alignment\n - Iterative refinement methods (MUSCLE, MAFFT) improve the initial MSA by repeatedly dividing the sequences into subgroups, realigning them, and merging the results\n- Scoring matrices (PAM, BLOSUM) assign scores to matches and mismatches between amino acids or nucleotides, reflecting the likelihood of substitutions based on evolutionary models\n- Gap penalties are used to discourage the introduction of gaps (insertions or deletions) in the alignment, with affine gap penalties assigning different costs to opening and extending gaps\n- Alignment quality can be assessed using measures such as percent identity, alignment length, and statistical significance (E-value)\n - Alignment visualization tools (Jalview, Aliview) facilitate the manual inspection and refinement of alignments\n\n## Phylogenetic Tree Construction\n- Distance-based methods (UPGMA, neighbor-joining) construct phylogenetic trees based on pairwise evolutionary distances between sequences\n - These methods are computationally efficient but may not always recover the true evolutionary history, especially when evolutionary rates vary among lineages\n- Maximum parsimony methods seek the tree that requires the fewest evolutionary changes to explain the observed sequence data\n - Parsimony can be sensitive to long-branch attraction, where rapidly evolving lineages are incorrectly grouped together\n- Maximum likelihood methods find the tree that maximizes the probability of observing the sequence data given a specific evolutionary model\n - Likelihood 
methods are statistically robust but computationally intensive, often requiring heuristic search algorithms to explore the tree space\n- Bayesian inference methods estimate the posterior probability distribution of trees based on the sequence data and prior probabilities of evolutionary models\n - Markov chain Monte Carlo (MCMC) algorithms are used to sample trees from the posterior distribution, providing a measure of uncertainty in the inferred phylogeny\n- Bootstrap analysis assesses the statistical support for each branch in a phylogenetic tree by resampling the sequence data and reconstructing trees from the resampled datasets\n- Outgroup rooting is used to determine the direction of evolution in a phylogenetic tree by including a distantly related sequence that branches off before the ingroup taxa\n\n## Comparative Genomics Tools and Software\n- BLAST (Basic Local Alignment Search Tool) is a widely used algorithm for comparing query sequences against a database of known sequences to identify homologs and infer functional relationships\n- EMBOSS (European Molecular Biology Open Software Suite) provides a comprehensive set of tools for sequence alignment, phylogenetic analysis, and genomic data manipulation\n- Bioconductor is an open-source software project for the analysis of high-throughput genomic data, offering a wide range of R packages for comparative genomics and visualization\n- Galaxy is a web-based platform for accessible, reproducible, and transparent computational research, allowing users to perform complex analyses using a graphical interface\n- Ensembl is a genome browser and database that provides access to genomic data for a wide range of species, along with comparative genomics tools and resources\n- UCSC Genome Browser is another popular genome browser that offers a variety of comparative genomics tracks and tools, including multiple sequence alignments and conservation scores\n- MEGA (Molecular Evolutionary Genetics Analysis) is a 
user-friendly software package for conducting sequence alignment, phylogenetic tree construction, and evolutionary analyses\n\n## Case Studies and Real-World Applications\n- Comparative genomics has been used to identify genes associated with specific traits or diseases, such as the FoxP2 gene involved in human speech and language development\n- Studying the evolution of antibiotic resistance genes in bacterial pathogens can inform strategies for combating the spread of resistance and developing new antibiotics\n - For example, comparative analyses have revealed the horizontal transfer of resistance genes between different bacterial species\n- Comparative genomics of crop plants and their wild relatives has facilitated the identification of genes related to agriculturally important traits, such as drought tolerance or disease resistance\n - This knowledge can be applied in breeding programs to develop improved crop varieties\n- Investigating the genomic basis of convergent evolution, where similar traits evolve independently in different lineages, can provide insights into the molecular mechanisms underlying adaptive evolution\n - Examples include the evolution of echolocation in bats and dolphins, or the repeated evolution of C4 photosynthesis in plants\n- Comparative genomics has been used to study the evolutionary history and population dynamics of endangered species, informing conservation efforts\n - For instance, analyzing the genetic diversity and demographic history of mountain gorillas has helped guide strategies for their protection and management\n\n## Future Directions and Challenges\n- Advances in sequencing technologies, such as long-read sequencing and single-cell sequencing, will continue to improve the quality and completeness of genomic data for comparative analyses\n- Developing more efficient algorithms and computational methods for handling the ever-increasing volume of genomic data remains an ongoing challenge\n- Integrating comparative 
genomics with other omics data (transcriptomics, proteomics, metabolomics) will provide a more comprehensive understanding of evolutionary processes and their functional consequences\n- Expanding comparative genomics studies to include a broader range of taxa, particularly underrepresented groups like invertebrates and microorganisms, will deepen our understanding of life's diversity and evolution\n- Improving methods for inferring and interpreting complex evolutionary scenarios, such as incomplete lineage sorting, introgression, and horizontal gene transfer, is an active area of research\n- Translating comparative genomics findings into practical applications, such as personalized medicine, conservation, and biotechnology, will require interdisciplinary collaborations and effective communication between researchers and stakeholders\n- Addressing ethical and social implications of comparative genomics research, particularly when studying human populations or culturally significant species, will be essential for responsible and equitable scientific progress","active":true,"order":5,"meta":{"title":"Comparative Genomics: Evolution Insights | Computational Genomics Class Notes","description":"Study guides to review Comparative Genomics: Evolution Insights. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"JqcieXv9ldJnXVoF","type":"STUDY_GUIDE","title":"5.4 Evolutionary rate estimation","slug":"evolutionary-rate-estimation","date":null,"keyTopics":[],"publicId":"JqcieXv9ldJnXVoF","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["8VPN09hbKrumLVRQ"],"duration":7},{"id":"qwkhEva6AKYprYkF","type":"STUDY_GUIDE","title":"5.5 Positive and negative selection","slug":"positive-negative-selection","date":null,"keyTopics":[],"publicId":"qwkhEva6AKYprYkF","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["9dL040tpIigaAVaa"],"duration":9},{"id":"Zs9wKuJDfHJxvl2Y","type":"STUDY_GUIDE","title":"5.2 Phylogenetic analysis","slug":"phylogenetic-analysis","date":null,"keyTopics":[],"publicId":"Zs9wKuJDfHJxvl2Y","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["RuBzYcqr4RIilAvL"],"duration":10},{"id":"PEED8NmGSbgPNZh8","type":"STUDY_GUIDE","title":"5.3 Genome alignment and synteny","slug":"genome-alignment-synteny","date":null,"keyTopics":[],"publicId":"PEED8NmGSbgPNZh8","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["NRnAHJ6NsPgNFDZx"],"duration":12},{"id":"jN3NzyeqHurvKbDx","type":"STUDY_GUIDE","title":"5.1 Orthology and paralogy","slug":"orthology-paralogy","date":null,"keyTopics":[],"publicId":"jN3NzyeqHurvKbDx","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["E8KG1GRGYCwJ2kml"],"duration":9}],"numResources":1},{"id":"IKIXDFB2kFhRpWBr","name":"Unit 6 – Regulatory Genomics & 
Epigenomics","emoji":"📚","slug":"unit-6","description":"Unit 6: Regulatory Genomics and Epigenomics","intro":"Regulatory genomics and epigenomics explore how genes are controlled without changing DNA. These fields study elements like enhancers and silencers, as well as modifications like DNA methylation and histone changes that affect gene expression.\n\nUnderstanding these mechanisms is crucial for grasping how cells function and differentiate. This knowledge has applications in medicine, helping explain disease origins and develop new treatments targeting gene regulation processes.","overview":"## Key Concepts and Definitions\n- Regulatory genomics studies how gene expression is controlled by various regulatory elements and mechanisms\n- Epigenomics focuses on heritable changes in gene expression that do not involve alterations to the DNA sequence itself\n- Transcription factors (TFs) are proteins that bind to specific DNA sequences and regulate transcription of target genes\n- Enhancers are distal regulatory elements that positively regulate gene expression by interacting with promoters through DNA looping\n - Can be located upstream or downstream of the genes they regulate (e.g., the sonic hedgehog enhancer located ~1 Mb upstream of the SHH gene)\n- Silencers are regulatory elements that negatively regulate gene expression by recruiting repressive factors\n- Insulators are boundary elements that prevent inappropriate interactions between neighboring chromatin domains\n- Chromatin accessibility refers to the degree to which DNA is accessible to TFs and other regulatory proteins\n - Open chromatin regions are associated with active gene expression, while closed chromatin is associated with gene repression\n\n## Regulatory Elements in the Genome\n- Promoters are located near the transcription start site (TSS) and contain binding sites for RNA polymerase and general TFs\n- Proximal promoters are located immediately upstream of the TSS and contain core promoter 
elements (TATA box, initiator, downstream promoter element)\n- Distal promoters are located further upstream and contain additional regulatory elements (CpG islands, upstream activating sequences)\n- Enhancers can be located far from their target genes and interact with promoters through DNA looping mediated by cohesin and mediator complexes\n - Super-enhancers are clusters of enhancers that drive high levels of gene expression in cell type-specific manner (e.g., the α-globin super-enhancer in erythroid cells)\n- Silencers can be located near or far from their target genes and recruit repressive factors (histone deacetylases, polycomb group proteins)\n- Insulators prevent enhancer-promoter interactions and chromatin spreading by forming loops and interacting with nuclear lamina\n - CTCF is a key insulator-binding protein that mediates chromatin looping and TAD formation\n\n## Epigenetic Modifications and Mechanisms\n- DNA methylation involves the addition of methyl groups to cytosine residues, primarily at CpG dinucleotides\n - Methylation of promoter CpG islands is associated with gene silencing, while methylation of gene bodies is associated with active transcription\n- Histone modifications include acetylation, methylation, phosphorylation, and ubiquitination of histone tails\n - H3K4me3 is associated with active promoters, while H3K27me3 is associated with repressed promoters and enhancers\n - H3K27ac is associated with active enhancers and distinguishes them from poised enhancers marked by H3K4me1 alone\n- Chromatin remodeling involves the ATP-dependent alteration of nucleosome positioning and composition by remodeling complexes (SWI/SNF, ISWI, CHD, INO80)\n- Non-coding RNAs (ncRNAs) can regulate gene expression through various mechanisms\n - Long non-coding RNAs (lncRNAs) can recruit chromatin-modifying complexes, act as enhancer RNAs, or serve as scaffolds for protein complexes (e.g., XIST lncRNA in X chromosome inactivation)\n - microRNAs (miRNAs) can 
post-transcriptionally repress gene expression by targeting mRNAs for degradation or translational repression\n\n## Experimental Techniques in Regulatory Genomics\n- Chromatin immunoprecipitation followed by sequencing (ChIP-seq) allows genome-wide mapping of protein-DNA interactions\n - Used to identify binding sites of TFs, histone modifications, and chromatin-associated proteins\n - Requires antibodies specific to the protein of interest and sufficient cell numbers for robust signal\n- DNase-seq and ATAC-seq identify regions of open chromatin by digesting accessible DNA with DNase I or transposase, respectively\n - Open chromatin regions are indicative of active regulatory elements (promoters, enhancers) and TF binding sites\n- Bisulfite sequencing determines DNA methylation patterns by converting unmethylated cytosines to uracil while leaving methylated cytosines unchanged\n - Whole-genome bisulfite sequencing (WGBS) provides single-base resolution methylation profiles, but is costly and requires high coverage\n - Reduced representation bisulfite sequencing (RRBS) focuses on CpG-rich regions and is more cost-effective\n- Chromosome conformation capture (3C) techniques detect long-range chromatin interactions\n - Hi-C provides genome-wide interaction maps at kilobase to megabase resolution, revealing topologically associating domains (TADs) and chromatin loops\n - Chromatin interaction analysis by paired-end tag sequencing (ChIA-PET) identifies interactions mediated by specific proteins (e.g., CTCF, RNA polymerase II)\n\n## Computational Methods for Epigenomic Analysis\n- Peak calling identifies enriched regions in ChIP-seq and accessibility data using algorithms (MACS2, HOMER, F-Seq) that compare signal to background\n- Differential analysis identifies regions with significant differences in signal intensity between conditions or cell types using tools like DiffBind and DESeq2\n- Chromatin state annotation segments the genome into distinct states (active 
promoter, strong enhancer, repressed, etc.) based on combinatorial histone modification patterns using hidden Markov models (ChromHMM, Segway)\n- Motif analysis identifies enriched DNA sequence motifs in regulatory regions using de novo discovery tools (MEME, DREME) or known motif scanning (FIMO, HOMER)\n - Motif instances can be used to infer TF binding and construct regulatory networks\n- Integrative analysis combines multiple data types (e.g., ChIP-seq, RNA-seq, Hi-C) to gain insights into regulatory mechanisms and predict functional effects of genetic variants\n - Tools like GREAT and RegulomeDB annotate variants with regulatory information and predict their impact on gene expression\n- Machine learning approaches (e.g., deep learning) are increasingly used to predict regulatory elements, chromatin states, and gene expression from DNA sequence and epigenomic data\n\n## Regulatory Networks and Gene Expression\n- Gene regulatory networks (GRNs) describe the complex interactions between TFs and their target genes that control cell type-specific gene expression programs\n - Can be constructed using TF binding data, gene expression data, and computational inference methods (e.g., ARACNE, GENIE3)\n- Transcriptional regulation involves the interplay between TFs, co-factors, and chromatin state to control the initiation and rate of transcription\n - General TFs (TFIIA, TFIIB, TFIID, TFIIE, TFIIF, TFIIH) assemble at the promoter to form the pre-initiation complex and recruit RNA polymerase II\n - Sequence-specific TFs bind to enhancers and promoters to activate or repress transcription by recruiting co-activators or co-repressors\n- Post-transcriptional regulation modulates gene expression through mRNA processing, stability, and translation\n - Alternative splicing generates transcript isoforms with different functions or stability\n - miRNAs and RNA-binding proteins (RBPs) regulate mRNA stability and translation efficiency\n- Feedback loops and feed-forward loops are 
common motifs in GRNs that enable precise control of gene expression dynamics\n - Negative feedback loops confer robustness and homeostasis, while positive feedback loops amplify signals and generate switch-like responses\n\n## Applications in Health and Disease\n- Genome-wide association studies (GWAS) have identified numerous disease-associated variants, many of which lie in non-coding regulatory regions\n - Integrating GWAS data with epigenomic data can help prioritize causal variants and elucidate their functional effects on gene regulation\n- Epigenetic alterations are implicated in various diseases, including cancer, neurodevelopmental disorders, and autoimmune diseases\n - DNA methylation changes and aberrant histone modifications can lead to altered gene expression and disease progression\n - Epigenetic drugs targeting DNA methyltransferases (DNMTs) and histone deacetylases (HDACs) are used in cancer treatment (e.g., azacitidine, vorinostat)\n- Personalized medicine approaches leverage epigenomic data to stratify patients, predict drug responses, and develop targeted therapies\n - Epigenetic biomarkers can be used for early detection, prognosis, and treatment selection in various diseases\n- Epigenetic inheritance and transgenerational effects are areas of active research\n - Epigenetic modifications can be inherited across generations and may contribute to disease risk and evolutionary adaptation\n - Environmental factors (diet, stress, toxins) can induce epigenetic changes that affect offspring health and development\n\n## Emerging Trends and Future Directions\n- Single-cell epigenomics technologies (scRNA-seq, scATAC-seq, scBS-seq) enable the study of epigenetic heterogeneity and cell type-specific regulatory landscapes\n - Help identify rare cell types, developmental trajectories, and epigenetic states associated with disease\n- Spatial epigenomics methods (e.g., spatially resolved ChIP-seq, spatial transcriptomics) provide information on the spatial 
organization of regulatory elements and gene expression in tissues\n- CRISPR-based epigenome editing tools allow targeted manipulation of DNA methylation and histone modifications at specific loci\n - Used to dissect the functional roles of epigenetic modifications and regulatory elements in gene regulation and disease\n- Integration of multi-omics data (epigenomics, transcriptomics, proteomics, metabolomics) using systems biology approaches will provide a more comprehensive understanding of gene regulation and its impact on cellular phenotypes\n- Machine learning and artificial intelligence will play an increasingly important role in analyzing and interpreting large-scale epigenomic datasets\n - Deep learning models can predict epigenetic states, gene expression, and disease outcomes from DNA sequence and other features\n- Comparative epigenomics across species will shed light on the evolution of regulatory mechanisms and their role in adaptation and speciation\n- Epigenetic clocks based on DNA methylation patterns can predict biological age and are being developed as biomarkers of aging and disease risk","active":true,"order":6,"meta":{"title":"Regulatory Genomics & Epigenomics | Computational Genomics Class Notes","description":"Study guides to review Regulatory Genomics & Epigenomics. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"CrVeWEFf4Jdb9P7F","type":"STUDY_GUIDE","title":"6.1 Transcription factors and regulatory elements","slug":"transcription-factors-regulatory-elements","date":null,"keyTopics":[],"publicId":"CrVeWEFf4Jdb9P7F","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Ipj1yJ5onEiTeQ1q"],"duration":12},{"id":"YzfWiXlI8XGUBijb","type":"STUDY_GUIDE","title":"6.2 Chromatin structure and histone modifications","slug":"chromatin-structure-histone-modifications","date":null,"keyTopics":[],"publicId":"YzfWiXlI8XGUBijb","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["uw0oZMpud5qFl5xf"],"duration":8},{"id":"TjpuKK3MJT9WJaY2","type":"STUDY_GUIDE","title":"6.4 Chromatin immunoprecipitation (ChIP) and ChIP-seq","slug":"chromatin-immunoprecipitation-chip-chip-seq","date":null,"keyTopics":[],"publicId":"TjpuKK3MJT9WJaY2","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["ldCU09JwO681XHF9"],"duration":8},{"id":"apljhrv5aSHV6Moq","type":"STUDY_GUIDE","title":"6.5 Enhancer-promoter interactions","slug":"enhancer-promoter-interactions","date":null,"keyTopics":[],"publicId":"apljhrv5aSHV6Moq","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["kypTP9sWSMmpbRy9"],"duration":11},{"id":"H8RunDQhXRfQImam","type":"STUDY_GUIDE","title":"6.3 DNA 
methylation","slug":"dna-methylation","date":null,"keyTopics":[],"publicId":"H8RunDQhXRfQImam","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["8GnKRrTqnNqRhukj"],"duration":7}],"numResources":1},{"id":"ngCP8wlfCNs7BMFL","name":"Unit 7 – Structural Variation & Copy Number Analysis","emoji":"📚","slug":"unit-7","description":"Unit 7: Structural Variation and Copy Number Analysis","intro":"Structural variations and copy number analysis are crucial aspects of genomic research. These large-scale changes in DNA structure, including deletions, duplications, and inversions, play significant roles in genetic diversity and disease susceptibility. Understanding these variations helps unravel genome evolution and disease mechanisms.\n\nDetection methods for structural variants have evolved from microarray-based techniques to advanced next-generation sequencing technologies. Bioinformatics tools aid in analyzing and interpreting this complex data, while clinical applications of structural variant analysis continue to expand, impacting genetic testing and personalized medicine approaches.","overview":"## Introduction to Structural Variation\n- Structural variations (SVs) represent large-scale changes in the genome structure\n- Includes deletions, duplications, insertions, inversions, and translocations of DNA segments\n- SVs can range in size from 50 base pairs to several megabases\n- Play a significant role in genetic diversity and disease susceptibility\n- Contribute to phenotypic differences between individuals\n- SVs can disrupt gene function, alter gene dosage, or create fusion genes\n- Studying SVs helps understand genome evolution and disease mechanisms\n\n## Types of Structural Variants\n- Deletions involve the loss of a DNA segment from a chromosome\n - Can range from a few base pairs to large chromosomal regions\n - May result in loss of genetic material and altered 
gene function\n- Duplications occur when a DNA segment is copied one or more times\n - Leads to an increase in the number of copies of a particular gene or genomic region\n - Can potentially increase gene dosage and alter gene expression levels\n- Insertions introduce additional DNA sequences into a chromosome\n - Can originate from the same chromosome or a different chromosome\n - May disrupt gene function or create novel fusion genes\n- Inversions happen when a DNA segment is flipped 180 degrees within a chromosome\n - Can alter gene orientation and disrupt regulatory elements\n - May lead to changes in gene expression patterns\n- Translocations involve the exchange of DNA segments between non-homologous chromosomes\n - Can create fusion genes or disrupt gene function at the breakpoints\n - Balanced translocations do not result in net gain or loss of genetic material\n - Unbalanced translocations lead to gain or loss of genetic material\n\n## Copy Number Variations (CNVs)\n- CNVs are a type of structural variation involving changes in the number of copies of a particular DNA segment\n- Can include deletions (fewer copies) or duplications (more copies) of a genomic region\n- CNVs can range in size from a few kilobases to several megabases\n- Contribute significantly to genetic diversity and disease susceptibility\n- Can influence gene dosage and alter gene expression levels\n- Some CNVs are associated with specific genetic disorders (Charcot-Marie-Tooth disease)\n- CNVs can also be benign and present in healthy individuals\n- Studying CNVs helps understand the genetic basis of complex traits and diseases\n\n## Detection Methods for Structural Variants\n- Microarray-based methods detect SVs by comparing DNA hybridization patterns\n - Comparative genomic hybridization (CGH) arrays compare test and reference samples\n - Single nucleotide polymorphism (SNP) arrays identify SVs based on SNP genotypes\n- Next-generation sequencing (NGS) technologies enable 
high-resolution SV detection\n - Whole-genome sequencing (WGS) provides comprehensive coverage of the entire genome\n - Targeted sequencing focuses on specific genomic regions of interest\n- Read-depth analysis infers copy number changes based on the depth of sequencing reads\n- Split-read mapping identifies SVs by aligning reads spanning SV breakpoints\n- Paired-end mapping detects SVs by analyzing discordant read pairs\n- Long-read sequencing technologies (PacBio, Oxford Nanopore) improve SV detection accuracy\n- Optical mapping generates high-resolution physical maps to identify large-scale SVs\n\n## Bioinformatics Tools for SV Analysis\n- Alignment tools map sequencing reads to a reference genome (BWA, Bowtie)\n- SV callers identify SVs from aligned sequencing data (Manta, Lumpy, BreakDancer)\n- CNV detection tools analyze read depth and identify copy number changes (CNVnator, FREEC)\n- Annotation tools provide functional and clinical interpretation of SVs (ANNOVAR, VEP)\n- Visualization tools display SV data and facilitate interpretation (IGV, Circos)\n- Data management and integration platforms handle large-scale SV datasets (Galaxy, DNAnexus)\n- Quality control tools assess the quality and reliability of SV calls (SVQual, SVScore)\n- Benchmarking and validation datasets help evaluate the performance of SV detection methods\n\n## Interpreting Structural Variant Data\n- Assess the quality and reliability of SV calls using quality metrics and filtering criteria\n- Annotate SVs with functional and clinical information using databases and annotation tools\n- Determine the potential impact of SVs on gene function and disease risk\n- Consider the frequency and population distribution of SVs using population databases (gnomAD)\n- Evaluate the inheritance pattern and segregation of SVs in families\n- Integrate SV data with other types of genomic data (gene expression, epigenetic modifications)\n- Validate SV calls using orthogonal methods (PCR, Sanger sequencing) for 
critical findings\n- Interpret SVs in the context of the individual's clinical presentation and family history\n\n## Clinical Implications and Applications\n- SVs can contribute to the development of various genetic disorders and complex diseases\n- Deletions and duplications can cause genomic disorders (DiGeorge syndrome, Williams syndrome)\n- SVs can disrupt tumor suppressor genes or activate oncogenes in cancer\n- CNVs are associated with neurodevelopmental disorders (autism, schizophrenia)\n- SVs can influence pharmacogenomic traits and drug response\n- SV analysis is important in prenatal and postnatal genetic testing\n- SVs can be used as diagnostic and prognostic biomarkers for certain diseases\n- Understanding SVs helps guide personalized treatment and management strategies\n\n## Challenges and Future Directions\n- Improving the accuracy and sensitivity of SV detection methods, particularly for complex SVs\n- Developing standardized protocols and guidelines for SV analysis and reporting\n- Integrating SV data with other omics data to gain a comprehensive understanding of disease mechanisms\n- Establishing large-scale SV databases and resources for research and clinical applications\n- Addressing the challenges of interpreting SVs in non-coding regions of the genome\n- Investigating the role of SVs in complex traits and common diseases\n- Developing targeted therapies and interventions based on SV information\n- Exploring the use of long-read sequencing technologies for improved SV detection and characterization","active":true,"order":7,"meta":{"title":"Structural Variation & Copy Number Analysis | Computational Genomics Class Notes","description":"Study guides to review Structural Variation & Copy Number Analysis. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"e5JIMOVQQkzNL0oM","type":"STUDY_GUIDE","title":"7.1 Types of structural variations","slug":"types-structural-variations","date":null,"keyTopics":[],"publicId":"e5JIMOVQQkzNL0oM","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["jOuboSE5fApInCvh"],"duration":12},{"id":"gkFxQKfvS4gWMJ46","type":"STUDY_GUIDE","title":"7.2 Copy number variations (CNVs)","slug":"copy-number-variations-cnvs","date":null,"keyTopics":[],"publicId":"gkFxQKfvS4gWMJ46","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["GU87Rj8DnJbwMxbR"],"duration":11},{"id":"MRB2RHP9LzYIr6tf","type":"STUDY_GUIDE","title":"7.5 Structural variant detection methods","slug":"structural-variant-detection-methods","date":null,"keyTopics":[],"publicId":"MRB2RHP9LzYIr6tf","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["ockRvetdG9HdGQWH"],"duration":9},{"id":"S5X8J6owP4deK1KH","type":"STUDY_GUIDE","title":"7.4 Inversions and translocations","slug":"inversions-translocations","date":null,"keyTopics":[],"publicId":"S5X8J6owP4deK1KH","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Eza1YOxBGsRfLx1K"],"duration":11},{"id":"ZyGtaVCCm6mPyMDA","type":"STUDY_GUIDE","title":"7.3 Insertions and deletions 
(indels)","slug":"insertions-deletions-indels","date":null,"keyTopics":[],"publicId":"ZyGtaVCCm6mPyMDA","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["ylWIwsj3VFsN53IB"],"duration":12}],"numResources":1},{"id":"KrICFmBMS0kFsqLb","name":"Unit 8 – Population Genomics and GWAS","emoji":"📚","slug":"unit-8","description":"Unit 8: Population Genomics and Genome-Wide Association Studies","intro":"Population genomics examines genetic variation within and between populations to understand evolutionary processes and population structure. It explores concepts like genetic diversity, drift, selection, and gene flow, using tools such as linkage disequilibrium and Hardy-Weinberg equilibrium.\n\nGenome-wide association studies (GWAS) identify genetic variants linked to traits or diseases in populations. GWAS uses case-control designs, genotyping arrays, and statistical methods to uncover associations, considering factors like population structure and multiple testing correction.","overview":"## Key Concepts in Population Genomics\n- Population genomics studies genetic variation within and between populations to understand evolutionary processes and population structure\n- Genetic diversity refers to the total number of genetic characteristics in the genetic makeup of a species\n- Genetic drift is the change in allele frequencies in a population due to random sampling of organisms\n- Natural selection is the process whereby organisms better adapted to their environment tend to survive and produce more offspring\n- Gene flow is the transfer of genetic variation from one population to another through migration or admixture\n- Linkage disequilibrium (LD) is the non-random association of alleles at different loci in a given population\n - LD can be influenced by factors such as population structure, selection, and recombination rates\n- Hardy-Weinberg equilibrium (HWE) is a state in 
which allele and genotype frequencies remain constant from generation to generation in the absence of evolutionary influences\n\n## Genetic Variation and Population Structure\n- Single nucleotide polymorphisms (SNPs) are the most common type of genetic variation used in population genomics studies\n- Copy number variations (CNVs) and insertions/deletions (indels) also contribute to genetic variation within populations\n- Population structure refers to the presence of genetically distinct subgroups within a population\n- Principal component analysis (PCA) is a statistical method used to visualize and assess population structure\n - PCA reduces high-dimensional genetic data into a smaller number of principal components that capture the majority of the variation\n- Admixture analysis estimates the proportions of an individual's genome that originate from different ancestral populations\n- F-statistics (Fst) measure the degree of genetic differentiation between populations\n - Fst values range from 0 (no differentiation) to 1 (complete differentiation)\n- Isolation by distance (IBD) is a pattern where genetic differences between populations increase with geographic distance due to limited gene flow\n\n## GWAS Fundamentals\n- Genome-wide association studies (GWAS) aim to identify genetic variants associated with traits or diseases in a population\n- GWAS typically use a case-control design, comparing allele frequencies between individuals with (cases) and without (controls) a specific phenotype\n- The common disease-common variant (CDCV) hypothesis suggests that common diseases are influenced by common genetic variants with small effect sizes\n- Genotyping arrays are used to simultaneously genotype hundreds of thousands to millions of SNPs across the genome\n- Imputation is the process of inferring unobserved genotypes based on reference panels and linkage disequilibrium patterns\n- Multiple testing correction is essential in GWAS to control for false positives due to 
the large number of statistical tests performed\n - Bonferroni correction and false discovery rate (FDR) are commonly used methods for multiple testing correction\n- Manhattan plots visualize GWAS results, with the negative logarithm of the p-value plotted against the genomic position of each SNP\n\n## Data Collection and Quality Control\n- Study design considerations for GWAS include sample size, case-control ratio, and population stratification\n- Genotyping quality control (QC) steps are crucial to ensure the accuracy and reliability of GWAS results\n- SNP QC measures include call rate, minor allele frequency (MAF), and Hardy-Weinberg equilibrium (HWE) testing\n - SNPs with low call rates, low MAF, or deviations from HWE are often excluded from analysis\n- Sample QC measures include individual call rate, heterozygosity, and relatedness checks\n - Samples with low call rates, extreme heterozygosity, or cryptic relatedness may be removed\n- Population stratification can lead to spurious associations and is often addressed using principal component analysis (PCA) or mixed models\n- Batch effects can arise from technical factors (genotyping platform, lab, or processing date) and should be identified and corrected\n- Phenotype data quality is equally important, with considerations for phenotype definition, measurement, and harmonization across studies\n\n## Statistical Methods in GWAS\n- Single-SNP association tests, such as the chi-square test or logistic regression, are used to assess the association between each SNP and the phenotype of interest\n- Linear regression is used for quantitative traits, while logistic regression is used for binary traits (case-control studies)\n- Covariates, such as age, sex, and principal components, can be included in the regression models to adjust for potential confounding factors\n- Mixed linear models (MLMs) are used to account for population structure and cryptic relatedness by incorporating a kinship matrix\n- Meta-analysis 
combines GWAS results from multiple studies to increase statistical power and identify robust associations\n - Fixed-effect and random-effect models are used depending on the heterogeneity of effect sizes across studies\n- Bayesian methods, such as Bayesian variable selection regression (BVSR), can be used to prioritize SNPs and estimate their effect sizes\n- Polygenic risk scores (PRS) aggregate the effects of multiple SNPs to predict an individual's risk for a specific trait or disease\n\n## Interpreting GWAS Results\n- Genome-wide significance threshold is typically set at $p < 5 \\times 10^{-8}$ to account for multiple testing in GWAS\n- Locus zoom plots visualize the association signals and linkage disequilibrium patterns in a specific genomic region\n- Functional annotation of GWAS hits involves integrating information from various sources (e.g., gene expression, epigenetics, and biological pathways) to understand their potential functional impact\n- Heritability estimates the proportion of phenotypic variance explained by genetic factors and can be calculated using GWAS summary statistics\n- Genetic correlation analysis assesses the shared genetic basis between different traits or diseases using GWAS summary statistics\n- Mendelian randomization uses genetic variants as instrumental variables to infer causal relationships between exposures and outcomes\n- Replication of GWAS findings in independent cohorts is essential to validate the associations and assess their generalizability\n\n## Challenges and Limitations\n- Missing heritability refers to the gap between the heritability estimates from family studies and the variance explained by GWAS-identified variants\n- Rare variants (MAF < 1%) are not well captured by standard GWAS genotyping arrays and may require sequencing-based approaches\n- Gene-environment interactions can modulate the effect of genetic variants on the phenotype but are often not accounted for in GWAS\n- Phenotypic heterogeneity, where 
different genetic variants contribute to different subtypes of a disease, can reduce the power of GWAS\n- Population-specific genetic effects may limit the transferability of GWAS findings across diverse populations\n- Biological interpretation of GWAS results can be challenging, as associated variants may not be the causal variants and may affect genes or regulatory elements distant from the SNP\n- Ethical considerations, such as informed consent, data privacy, and the potential for genetic discrimination, must be addressed in GWAS\n\n## Applications and Future Directions\n- Drug target discovery and repositioning: GWAS can identify novel therapeutic targets and suggest potential drug repurposing opportunities\n- Precision medicine: GWAS findings can inform personalized risk prediction, diagnosis, and treatment strategies\n- Integration of multi-omics data (transcriptomics, epigenomics, and proteomics) can provide a more comprehensive understanding of the biological mechanisms underlying GWAS associations\n- Fine-mapping and functional validation studies are necessary to pinpoint the causal variants and elucidate their functional consequences\n- Transethnic GWAS and meta-analyses can improve the power to detect associations and assess the generalizability of findings across diverse populations\n- Polygenic risk scores (PRS) have the potential to improve risk stratification and targeted interventions, but their clinical utility and ethical implications need to be carefully considered\n- Machine learning and artificial intelligence approaches can be applied to GWAS data to improve risk prediction, identify novel associations, and uncover complex genetic architectures\n- Collaboration and data sharing among researchers, institutions, and countries are crucial to accelerate progress in GWAS and translate the findings into tangible benefits for human health","active":true,"order":8,"meta":{"title":"Population Genomics and GWAS | Computational Genomics Class 
Notes","description":"Study guides to review Population Genomics and GWAS. For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"6bVVfn9ALT9xth8e","type":"STUDY_GUIDE","title":"8.1 Hardy-Weinberg equilibrium","slug":"hardy-weinberg-equilibrium","date":null,"keyTopics":[],"publicId":"6bVVfn9ALT9xth8e","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["xLT1mPuxso3xnair"],"duration":8},{"id":"a2UOEyrGtO4ozqdW","type":"STUDY_GUIDE","title":"8.2 Linkage disequilibrium","slug":"linkage-disequilibrium","date":null,"keyTopics":[],"publicId":"a2UOEyrGtO4ozqdW","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["HtdqzIUX0CE9SHa6"],"duration":9},{"id":"NCBju8eKM0scpnqn","type":"STUDY_GUIDE","title":"8.3 Population structure and admixture","slug":"population-structure-admixture","date":null,"keyTopics":[],"publicId":"NCBju8eKM0scpnqn","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["kTN9eazn8FJ7sY3F"],"duration":7},{"id":"Tv1TlSsbiCJ2AUVz","type":"STUDY_GUIDE","title":"8.4 Genome-wide association studies (GWAS)","slug":"genome-wide-association-studies-gwas","date":null,"keyTopics":[],"publicId":"Tv1TlSsbiCJ2AUVz","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["x4Y2VCR63WuiwAkV"],"duration":7},{"id":"VhpmcTUjYX7PqlVY","type":"STUDY_GUIDE","title":"8.5 Genotype 
imputation","slug":"genotype-imputation","date":null,"keyTopics":[],"publicId":"VhpmcTUjYX7PqlVY","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["w1CA7TtOEs2EbqlB"],"duration":7}],"numResources":1},{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","description":"Unit 9: Transcriptomics and RNA-seq Analysis","intro":"RNA-seq is a powerful tool for studying gene expression in cells and tissues. It allows researchers to quantify RNA levels, identify new transcripts, and detect alternative splicing events. This technique has revolutionized transcriptomics by enabling genome-wide analysis of gene expression at unprecedented depth and resolution.\n\nKey concepts in transcriptomics include the transcriptome, gene expression, alternative splicing, and non-coding RNAs. Understanding these concepts is crucial for designing RNA-seq experiments, analyzing data, and interpreting results. 
Proper experimental design, sample preparation, and quality control are essential for generating reliable RNA-seq data.","overview":"## What's RNA-seq?\n- RNA-seq (RNA sequencing) is a high-throughput sequencing technology used to study the transcriptome, which is the complete set of RNA transcripts in a cell or tissue at a specific time point\n- Enables researchers to quantify gene expression levels, identify novel transcripts, and discover alternative splicing events by directly sequencing the RNA molecules present in a sample\n- Provides a snapshot of the active genes and their expression levels under specific conditions (developmental stage, disease state, or treatment)\n- Offers several advantages over traditional gene expression profiling methods (microarrays) including higher sensitivity, wider dynamic range, and the ability to detect novel transcripts without prior knowledge of the genome sequence\n- Generates large amounts of data that require computational analysis to extract biologically meaningful insights\n - Typical RNA-seq experiment produces millions of short reads (50-150 base pairs) that need to be aligned to a reference genome or assembled de novo\n - Expression levels are quantified by counting the number of reads mapping to each gene or transcript\n- Has revolutionized the field of transcriptomics by enabling genome-wide analysis of gene expression at an unprecedented resolution and depth\n- Widely applied in various research areas (developmental biology, cancer research, and plant sciences) to understand the molecular mechanisms underlying biological processes and diseases\n\n## Key Concepts in Transcriptomics\n- Transcriptome refers to the complete set of RNA transcripts present in a cell or tissue at a given time point, including messenger RNAs (mRNAs), non-coding RNAs (ncRNAs), and small RNAs\n- Gene expression is the process by which the information encoded in a gene is used to synthesize functional gene products, primarily proteins\n - 
Expression levels can be quantified by measuring the abundance of mRNA transcripts produced from each gene\n - Differential gene expression refers to the changes in expression levels between different conditions (e.g., healthy vs. diseased, treated vs. untreated)\n- Alternative splicing is a regulated process during gene expression that allows a single gene to produce multiple mRNA isoforms, potentially encoding different protein variants\n - Occurs when exons are included or excluded from the final processed mRNA, or when introns are retained\n - Contributes to the diversity of the proteome and plays a crucial role in cell differentiation, development, and disease\n- Non-coding RNAs (ncRNAs) are functional RNA molecules that are not translated into proteins, but have important regulatory roles in gene expression and cellular processes\n - Examples include long non-coding RNAs (lncRNAs), microRNAs (miRNAs), and small interfering RNAs (siRNAs)\n- RNA editing is a post-transcriptional modification that alters the nucleotide sequence of an RNA molecule, potentially changing the amino acid sequence of the encoded protein or affecting the stability and localization of the RNA\n- Gene fusion occurs when two separate genes are joined together due to chromosomal rearrangements, resulting in the production of a chimeric RNA and potentially a fusion protein with altered function\n - Often associated with cancer development and can serve as diagnostic biomarkers or therapeutic targets\n\n## RNA-seq Experimental Design\n- Defining the research question and hypothesis is crucial for designing an appropriate RNA-seq experiment\n - Clearly state the biological question to be addressed and the specific hypotheses to be tested\n - Consider the type of samples to be analyzed (e.g., tissues, cell lines, or single cells) and the comparisons to be made (e.g., treatment vs. 
control, time course, or different developmental stages)\n- Determining the sequencing depth and read length depends on the research goals and the complexity of the transcriptome\n - Higher sequencing depth (more reads per sample) increases the sensitivity to detect low-abundance transcripts and rare isoforms\n - Longer read lengths (100-150 bp) improve the accuracy of transcript assembly and isoform identification\n - Balance between sequencing depth and number of biological replicates based on budget and experimental design\n- Selecting the appropriate number of biological replicates is essential for statistical power and reproducibility\n - Biological replicates (independent samples from different individuals or experiments) capture the biological variability and allow for robust differential expression analysis\n - At least three biological replicates per condition are recommended, with more replicates increasing the power to detect significant differences\n- Choosing the RNA extraction and library preparation methods based on the sample type and research objectives\n - Total RNA sequencing captures all RNA species, including mRNAs, ncRNAs, and small RNAs\n - mRNA sequencing (poly(A) selection) enriches for mature mRNAs by targeting the poly(A) tail\n - Ribosomal RNA (rRNA) depletion removes the highly abundant rRNA molecules to increase the coverage of other RNA species\n - Strand-specific protocols preserve the information about the originating strand of the RNA transcripts\n- Planning for data storage, management, and analysis infrastructure is important given the large volume of data generated by RNA-seq experiments\n - Raw sequencing data (FASTQ files) and processed data (aligned reads, count matrices) require significant storage capacity\n - Computational resources (high-performance computing clusters or cloud-based platforms) are necessary for data processing and analysis\n - Establish a data management plan for organizing, backing up, and sharing the 
data in accordance with FAIR (Findable, Accessible, Interoperable, and Reusable) principles\n\n## Sample Prep and Sequencing\n- RNA extraction is the first step in sample preparation, which involves isolating total RNA from the biological samples\n - Use commercially available kits (TRIzol, RNeasy) or phenol-chloroform extraction methods\n - Assess RNA quality and integrity using spectrophotometry (NanoDrop) and capillary electrophoresis (Bioanalyzer or TapeStation)\n - High-quality RNA with minimal degradation (RIN > 8) is essential for successful library preparation and sequencing\n- Library preparation converts the RNA molecules into cDNA libraries compatible with the sequencing platform\n - Fragmentation of RNA into smaller pieces (200-500 bp) to ensure uniform coverage across the transcriptome\n - Reverse transcription to synthesize cDNA from the fragmented RNA using random hexamer primers\n - Adapter ligation to attach platform-specific sequences to the ends of the cDNA fragments, enabling amplification and sequencing\n - Amplification of the cDNA library using PCR to increase the amount of material for sequencing\n - Size selection to enrich for fragments of the desired length and remove adapter dimers and other artifacts\n- Multiplexing allows for the simultaneous sequencing of multiple samples in a single run by using unique barcodes (sample-specific sequences) added during library preparation\n - Reduces sequencing costs and increases throughput\n - Requires careful design to ensure balanced representation of samples and avoid barcode crosstalk\n- Sequencing platforms (Illumina, PacBio, Oxford Nanopore) generate millions to billions of short reads (50-150 bp) or longer reads (1-100 kb) from the cDNA libraries\n - Illumina sequencing (HiSeq, NextSeq) is the most widely used platform for RNA-seq, offering high accuracy, throughput, and cost-effectiveness\n - PacBio and Oxford Nanopore sequencing provide longer reads that can improve the resolution of 
complex isoforms and splice variants, but have higher error rates and lower throughput compared to Illumina\n- Sequencing depth and read length should be chosen based on the research objectives and the complexity of the transcriptome\n - Aim for at least 20-30 million reads per sample for differential expression analysis of coding genes\n - Increase sequencing depth (50-100 million reads) for detecting low-abundance transcripts, non-coding RNAs, or rare isoforms\n - Longer read lengths (100-150 bp) improve the accuracy of transcript assembly and isoform identification\n\n## Quality Control and Preprocessing\n- Quality assessment of raw sequencing data (FASTQ files) is crucial for identifying and addressing any issues that may affect downstream analysis\n - Use tools like FastQC or MultiQC to generate quality control reports\n - Check for base quality scores, GC content, sequence duplication levels, and overrepresented sequences (adapters, primers)\n - Low-quality bases (Q < 20) and adapter sequences should be trimmed using tools like Trimmomatic or Cutadapt to improve the accuracy of alignment and quantification\n- Filtering out low-quality reads and contaminants helps to reduce noise and improve the signal-to-noise ratio in the data\n - Remove reads with a high proportion of low-quality bases (e.g., >50% bases with Q < 20)\n - Discard reads aligning to ribosomal RNA (rRNA) or other contaminating sequences (e.g., PhiX control) using tools like SortMeRNA or Bowtie2\n - Trim or filter out reads with adapter sequences, poly(A) tails, or other technical artifacts\n- Read trimming involves removing low-quality bases and adapter sequences from the ends of the reads\n - Performed using tools like Trimmomatic, Cutadapt, or BBDuk\n - Improves the accuracy of alignment and quantification by ensuring that only high-quality bases are used in the analysis\n - Trims bases below a specified quality threshold (e.g., Q < 20) from the 3' end of the reads\n - Removes adapter 
sequences by matching the read sequences against a library of known adapter sequences\n- Read deduplication identifies and removes PCR duplicates, which are reads originating from the same cDNA fragment during library amplification\n - PCR duplicates can introduce biases in quantification and lead to overestimation of expression levels\n - Tools like Picard MarkDuplicates or Clumpify can be used to identify and remove duplicate reads based on their alignment positions\n - Deduplication is more important for low-complexity libraries (e.g., small RNA-seq) or when using PCR-based library preparation methods\n- Quality control metrics and thresholds should be carefully evaluated and reported to ensure the reproducibility and reliability of the results\n - Base quality scores: Aim for a median Q-score > 30 and no more than 20% bases with Q < 20\n - Adapter content: Less than 10% of reads should contain adapter sequences after trimming\n - rRNA contamination: Less than 5% of reads should align to rRNA sequences\n - Alignment rate: At least 70-80% of reads should align uniquely to the reference genome or transcriptome\n - Read duplication rate: Depends on the library complexity and preparation method, but should be consistent across samples\n\n## Alignment and Quantification\n- Alignment maps the preprocessed reads to a reference genome or transcriptome to determine their originating locations\n - Spliced aligners (STAR, HISAT2, TopHat2) are used to handle reads spanning exon-exon junctions\n - Alignment parameters (e.g., mismatch rate, gap penalties) should be optimized based on the read length and quality\n - Alignment quality metrics (e.g., uniquely mapped reads, multi-mapped reads, unmapped reads) should be evaluated to assess the quality of the alignment\n- Transcript assembly reconstructs the full-length transcripts from the aligned reads, allowing for the identification of novel isoforms and splice variants\n - Reference-guided assembly (Cufflinks, StringTie) uses 
the reference genome annotation to guide the assembly process\n - De novo assembly (Trinity, Oases) reconstructs transcripts without relying on a reference genome, enabling the discovery of novel transcripts in non-model organisms\n - Hybrid assembly approaches (HISAT-StringTie, STAR-Cufflinks) combine reference-guided and de novo methods to improve the accuracy and completeness of the assembly\n- Quantification estimates the expression levels of genes and transcripts by counting the number of reads or fragments mapping to each feature\n - Count-based methods (HTSeq, featureCounts) assign reads to genes or exons based on their alignment positions\n - Requires a gene annotation file (GTF/GFF) to define the genomic coordinates of the features\n - Generates a count matrix with rows representing genes and columns representing samples\n - Transcript-level quantification (RSEM, Kallisto, Salmon) estimates the abundance of individual isoforms by probabilistically assigning reads to transcripts\n - Kallisto and Salmon use pseudoalignment or quasi-mapping to rapidly assign reads to transcripts without the need for a full alignment, while RSEM works from conventional read alignments\n - Outputs TPM (Transcripts Per Million) or FPKM (Fragments Per Kilobase of transcript per Million mapped reads) values, which normalize for transcript length and library size\n- Normalization adjusts the raw read counts to account for differences in library size, sequencing depth, and other technical factors that may affect the comparison of expression levels across samples\n - CPM (Counts Per Million) and RPM (Reads Per Million) normalize the read counts by the total number of mapped reads in each sample\n - TPM (Transcripts Per Million) and FPKM (Fragments Per Kilobase of transcript per Million mapped reads) additionally normalize for transcript length, making them more suitable for comparing expression levels across genes\n - TMM (Trimmed Mean of M-values) and DESeq2's median-of-ratios method are more robust normalization methods that account for differences in library composition and reduce the impact 
of highly expressed genes\n- Batch effect correction removes systematic biases introduced by technical factors (e.g., sequencing run, library preparation batch) that can confound the biological variation of interest\n - Tools like ComBat, SVA, and RUVSeq can be used to identify and correct for batch effects using statistical methods\n - Batch effects should be assessed and corrected for before downstream analysis to avoid false positives and improve the reproducibility of the results\n\n## Differential Expression Analysis\n- Differential expression analysis identifies genes or transcripts that are significantly up- or down-regulated between experimental conditions\n - Requires a count matrix of raw (un-normalized) read counts for each gene or transcript across all samples; count-based tools such as DESeq2 and edgeR apply their own normalization internally\n - Compares the expression levels between two or more groups (e.g., treatment vs. control, time points, tissue types) to identify genes with statistically significant changes in expression\n- Statistical methods for differential expression analysis model the distribution of read counts and test for significant differences between conditions\n - Negative binomial distribution (DESeq2, edgeR) is commonly used to model the overdispersion of read counts, accounting for both biological and technical variability\n - Generalized linear models (GLMs) are used to test for differential expression while controlling for confounding factors (e.g., batch effects, covariates)\n - Likelihood ratio tests (LRT) or Wald tests are used to assess the significance of the differential expression, generating p-values for each gene or transcript\n- Multiple testing correction adjusts the p-values to control the family-wise error rate (FWER) or the false discovery rate (FDR) when performing numerous simultaneous hypothesis tests\n - Bonferroni correction is a conservative method that multiplies the p-values by the number of tests performed, ensuring a family-wise error rate (FWER) of less than the specified threshold (e.g., 0.05)\n - Benjamini-Hochberg 
procedure is a more powerful method that controls the FDR, which is the expected proportion of false positives among all significant results\n - Adjusted p-values (q-values) < 0.05 are typically considered significant, but the threshold can be adjusted based on the desired balance between sensitivity and specificity\n- Fold change cutoffs are often used in combination with statistical significance to identify biologically meaningful changes in expression\n - Fold change (FC) is calculated as the ratio of the average normalized expression levels between two conditions (e.g., treatment/control)\n - Log2 fold change (LFC) is the logarithm (base 2) of the fold change, with positive values indicating up-regulation and negative values indicating down-regulation\n - A commonly used cutoff is a two-fold change in either direction (FC > 2 or FC < 0.5, equivalently |LFC| > 1), but the choice of threshold depends on the biological context and the desired level of stringency\n- Visualization of differential expression results helps to interpret and communicate the findings\n - Volcano plots display the statistical significance (-log10 p-value) against the fold change (log2 FC) for each gene, highlighting genes that are both statistically significant and biologically meaningful\n - Heatmaps cluster genes and samples based on their expression patterns, revealing groups of co-regulated genes and sample relationships\n - MA plots (M-A plots) show the average expression level (A) against the fold change (M) for each gene, helping to identify intensity-dependent biases and assess the overall distribution of differential expression\n\n## Functional Interpretation\n- Gene set enrichment analysis (GS","active":true,"order":9,"meta":{"title":"RNA-seq: Transcriptome Analysis | Computational Genomics Class Notes","description":"Study guides to review RNA-seq: Transcriptome Analysis. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"EmX26AxpxQtHCk2T","type":"STUDY_GUIDE","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","date":null,"keyTopics":[],"publicId":"EmX26AxpxQtHCk2T","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["5Bx3HMnMEh8FGIwo"],"duration":11},{"id":"Sgbz2v78n5w6Jpvf","type":"STUDY_GUIDE","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","date":null,"keyTopics":[],"publicId":"Sgbz2v78n5w6Jpvf","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["1UYjIcOGfFz9BTxr"],"duration":12},{"id":"XgFJq4aaYf0bUT1s","type":"STUDY_GUIDE","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","date":null,"keyTopics":[],"publicId":"XgFJq4aaYf0bUT1s","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Oyn4QsZ7wbM7upzy"],"duration":8},{"id":"YaOPd55WlPDCFUlp","type":"STUDY_GUIDE","title":"9.3 Differential gene expression","slug":"differential-gene-expression","date":null,"keyTopics":[],"publicId":"YaOPd55WlPDCFUlp","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Ixh9gEuKk3jFLbVE"],"duration":10},{"id":"6GSEt5qoBER92OkV","type":"STUDY_GUIDE","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","date":null,"keyTopics":[],"publicId":"6GSEt5qoBER92OkV","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["ALySWGgX9MQUg44N"],"duration":13}],"numResources":1},{"id":"2n1nV5ALPeAKCNnZ","name":"Unit 10 – 
Metagenomics & Microbiome Analysis","emoji":"📚","slug":"unit-10","description":"Unit 10: Metagenomics and Microbiome Analysis","intro":"Metagenomics and microbiome analysis unlock the hidden world of microbial communities. By studying collective genomes in environmental samples, researchers gain insights into microbial diversity, functions, and interactions. This field combines advanced sequencing technologies with sophisticated bioinformatics tools to decipher complex microbial ecosystems.\n\nFrom sampling techniques to data analysis, metagenomics offers a comprehensive view of microbiomes. Key concepts like alpha and beta diversity, OTUs, and ASVs help quantify microbial composition. Functional analysis reveals the metabolic potential of these communities, while applications span human health, environmental monitoring, and biotechnology.","overview":"## Key Concepts and Definitions\n- Metagenomics involves studying the collective genomes of microorganisms in an environmental sample\n- Microbiome refers to the entire community of microbes within a specific environment (gut, soil, ocean)\n- Amplicon sequencing targets specific genetic markers (16S rRNA gene) to identify microbial taxa present\n- Shotgun metagenomics sequences all DNA in a sample, providing insights into microbial functions and interactions\n- Alpha diversity measures the richness and evenness of microbial communities within a single sample\n - Richness refers to the number of unique taxa present\n - Evenness describes how evenly the taxa are distributed\n- Beta diversity assesses the differences in microbial composition between samples or environments\n- Operational Taxonomic Units (OTUs) cluster sequences based on similarity thresholds (97%) to define microbial taxa\n- Amplicon Sequence Variants (ASVs) offer higher resolution than OTUs by distinguishing sequences differing by a single nucleotide\n\n## Microbiome Sampling Techniques\n- Sample collection methods vary depending on the environment 
(swabs, fecal samples, water filters)\n- Aseptic techniques prevent contamination during sample collection and processing\n- Sample storage conditions (temperature, preservatives) maintain DNA integrity for downstream analysis\n- Negative controls assess potential contamination introduced during sample processing\n- Metadata collection (sample type, location, host information) provides context for data interpretation\n- DNA extraction protocols optimize yield and purity while minimizing bias\n - Mechanical lysis (bead beating) disrupts tough microbial cell walls\n - Enzymatic lysis (lysozyme) digests cell wall components\n- Quality control steps (gel electrophoresis, spectrophotometry) evaluate DNA quantity and purity before sequencing\n\n## DNA Sequencing for Metagenomics\n- High-throughput sequencing technologies (Illumina, PacBio, Oxford Nanopore) generate millions of reads per sample\n- 16S rRNA gene sequencing targets conserved and variable regions to identify bacteria and archaea\n - V3-V4 regions are commonly used for their taxonomic resolution\n - Primers designed to minimize amplification bias and maximize coverage\n- Shotgun metagenomics provides an unbiased view of the entire microbial community\n - Fragmented DNA is sequenced without prior amplification\n - Allows for the discovery of novel genes and pathways\n- Sequencing depth and coverage impact the ability to detect rare taxa and functions\n- Paired-end sequencing improves assembly and resolves repetitive regions\n- Multiplexing allows multiple samples to be sequenced simultaneously using unique barcodes\n- Quality control metrics (Q scores, read length, GC content) assess sequencing performance\n\n## Bioinformatics Tools and Pipelines\n- Quality filtering removes low-quality reads and trims adapters to improve downstream analysis\n - Tools: Trimmomatic, FastQC, PRINSEQ\n- Sequence assembly reconstructs genomes and metagenomes from short reads\n - De novo assembly (MEGAHIT, SPAdes) does not require 
a reference genome\n - Reference-based assembly (Bowtie2, BWA) maps reads to known genomes\n- Chimera detection identifies and removes artificially combined sequences\n - Tools: UCHIME, VSEARCH\n- Sequence clustering groups similar reads into OTUs or ASVs\n - Tools: QIIME, Mothur, DADA2\n- Taxonomic assignment matches sequences to reference databases (SILVA, Greengenes, RDP)\n - Naive Bayes classifiers (RDP Classifier) assign taxonomy based on k-mer composition\n - Sequence alignment tools (BLAST) identify closest matches in databases\n- Gene prediction and annotation identify functional potential of microbiomes\n - Tools: Prodigal, MetaGeneMark, KEGG, COG\n\n## Data Analysis and Visualization\n- Rarefaction curves assess sequencing depth and species richness\n- Alpha diversity metrics (Chao1, Shannon, Simpson) quantify within-sample diversity\n - Plotted using box plots or bar charts to compare groups\n- Beta diversity metrics (Bray-Curtis, UniFrac) measure between-sample differences\n - Visualized using Principal Coordinate Analysis (PCoA) or Non-Metric Multidimensional Scaling (NMDS)\n- Heatmaps display relative abundances of taxa across samples\n- Stacked bar plots show taxonomic composition at different levels (phylum, genus, species)\n- Correlation analyses (Spearman, Pearson) identify associations between microbial taxa and metadata\n- Statistical tests (ANOVA, PERMANOVA) assess significant differences between groups\n- Machine learning methods (Random Forests, Support Vector Machines) predict sample categories based on microbiome profiles\n\n## Taxonomic Classification Methods\n- Sequence similarity-based methods compare query sequences to reference databases\n - Best BLAST hit assigns taxonomy based on the top alignment score\n - Lowest common ancestor (LCA) algorithm (MEGAN) assigns shared taxonomy among multiple hits\n- Composition-based methods use k-mer frequencies and machine learning to classify sequences\n - Naive Bayes classifier (RDP Classifier) 
calculates posterior probabilities for each taxonomic rank\n - k-Nearest Neighbors (k-NN) assigns taxonomy based on the majority vote of the k most similar sequences\n- Phylogenetic placement methods insert query sequences into reference phylogenetic trees\n - Evolutionary placement algorithm (EPA) in RAxML places sequences based on maximum likelihood\n - pplacer uses Bayesian posterior probability to place sequences on a reference tree\n- Marker gene-based methods rely on single-copy, evolutionarily conserved genes\n - MetaPhlAn2 uses clade-specific marker genes to estimate relative abundances\n - mOTU uses marker genes to profile taxonomic composition and functional potential\n\n## Functional Analysis of Microbiomes\n- Gene prediction identifies open reading frames (ORFs) in assembled contigs\n - Tools: Prodigal, MetaGeneMark, FragGeneScan\n- Functional annotation assigns predicted genes to functional categories\n - Databases: KEGG, COG, eggNOG, Pfam\n - Tools: BLAST, DIAMOND, InterProScan\n- Pathway analysis maps annotated genes to metabolic pathways\n - Tools: KEGG Mapper, MetaCyc, HUMAnN2\n- Comparative analysis identifies differentially abundant functions between groups\n - Tools: LEfSe, DESeq2, edgeR\n- Metatranscriptomics assesses active gene expression in microbiomes\n - RNA sequencing (RNA-seq) quantifies transcript abundances\n - Differential expression analysis identifies genes responding to environmental changes\n- Metaproteomics characterizes the functional activity of microbiomes at the protein level\n - Mass spectrometry identifies and quantifies proteins\n - Protein-protein interaction networks reveal functional associations\n\n## Applications and Case Studies\n- Human gut microbiome studies link dysbiosis to diseases (obesity, inflammatory bowel disease, diabetes)\n - Fecal microbiota transplantation (FMT) is used to treat recurrent Clostridium difficile infection\n - Probiotics and prebiotics modulate the gut microbiome for therapeutic 
purposes\n- Environmental microbiome studies assess biodiversity and monitor ecosystem health\n - Soil microbiomes influence plant growth and nutrient cycling\n - Marine microbiomes play crucial roles in global biogeochemical cycles\n- Bioremediation uses microbial communities to degrade pollutants and clean up contaminated sites\n - Metagenomics identifies key microbial taxa and genes involved in bioremediation processes\n - Monitoring microbiome shifts helps optimize bioremediation strategies\n- Agriculture applies microbiome knowledge to improve crop yields and disease resistance\n - Plant growth-promoting rhizobacteria (PGPR) enhance nutrient uptake and stress tolerance\n - Microbiome-based biocontrol agents suppress plant pathogens\n- Personalized medicine leverages microbiome data for precision treatments\n - Microbiome-based biomarkers predict disease risk and treatment response\n - Targeted modulation of the microbiome through diet, probiotics, and fecal transplants","active":true,"order":10,"meta":{"title":"Metagenomics & Microbiome Analysis | Computational Genomics Class Notes","description":"Study guides to review Metagenomics & Microbiome Analysis. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"qV6j2t8Cf7jqK5FF","type":"STUDY_GUIDE","title":"10.1 Microbial community profiling","slug":"microbial-community-profiling","date":null,"keyTopics":[],"publicId":"qV6j2t8Cf7jqK5FF","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["jvs3mMCnQcrWd09u"],"duration":9},{"id":"bWBJWQK6rb4vvl4u","type":"STUDY_GUIDE","title":"10.2 16S rRNA sequencing","slug":"16s-rrna-sequencing","date":null,"keyTopics":[],"publicId":"bWBJWQK6rb4vvl4u","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["tv1Hko9LuSAY0kkS"],"duration":10},{"id":"AbgVUgkpA1WG2yZN","type":"STUDY_GUIDE","title":"10.5 Functional analysis of microbial communities","slug":"functional-analysis-microbial-communities","date":null,"keyTopics":[],"publicId":"AbgVUgkpA1WG2yZN","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["CMLdakx03EShE6t3"],"duration":9},{"id":"ztztgVMcPcc2gahc","type":"STUDY_GUIDE","title":"10.3 Shotgun metagenomics","slug":"shotgun-metagenomics","date":null,"keyTopics":[],"publicId":"ztztgVMcPcc2gahc","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["9UJ4wqhkMHYN4mj7"],"duration":6},{"id":"W1ZiaVupDeBpgxm2","type":"STUDY_GUIDE","title":"10.4 Metagenome assembly and binning","slug":"metagenome-assembly-binning","date":null,"keyTopics":[],"publicId":"W1ZiaVupDeBpgxm2","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["EWU3Ok2euhNCm9S3"],"duration":6}],"numResources":1},{"id":"Rkw1WjdHC2dHRhE9","name":"Unit 11 – Genomic 
Data Visualization & Analysis","emoji":"📚","slug":"unit-11","description":"Unit 11: Genomic Data Visualization and Interpretation","intro":"Genomic data visualization and analysis are crucial for understanding complex biological information. These techniques allow researchers to interpret vast amounts of genomic data, identify patterns, and draw meaningful conclusions about gene function, disease mechanisms, and evolutionary relationships.\n\nFrom key concepts like sequencing and variant calling to advanced techniques like deep learning and spatial transcriptomics, this field combines biology, statistics, and computer science. Practical applications range from cancer genomics to agricultural improvements, showcasing the broad impact of genomic analysis on science and society.","overview":"## Key Concepts and Terminology\n- Genomics focuses on the structure, function, evolution, and mapping of genomes\n- Bioinformatics combines biology, computer science, and statistics to analyze and interpret biological data\n- Sequencing technologies (Illumina, PacBio) enable reading the nucleotide sequence of DNA or RNA\n- Alignment maps sequencing reads to a reference genome to identify their genomic positions\n- Variant calling detects differences (SNPs, indels, CNVs) between sequencing data and a reference genome\n- Gene expression quantifies the abundance of RNA transcripts to study gene activity across conditions\n- Epigenetics investigates heritable changes in gene expression not caused by alterations in the DNA sequence (DNA methylation, histone modifications)\n- Functional annotation assigns biological functions to genomic elements based on experimental evidence or computational predictions\n\n## Data Types and Formats in Genomics\n- FASTA format represents nucleotide or amino acid sequences using single-letter codes preceded by a header line\n- FASTQ format stores both biological sequences and their corresponding quality scores, with each entry consisting of four 
lines\n- SAM (Sequence Alignment/Map) format stores alignment information of sequencing reads mapped to a reference genome\n - BAM (Binary Alignment/Map) is the binary, compressed version of SAM for efficient storage and processing\n - CRAM is a more compressed alternative to BAM, achieving better compression ratios\n- VCF (Variant Call Format) represents genetic variations (SNPs, indels, structural variants) across multiple samples\n- BED (Browser Extensible Data) format defines genomic regions or features using tab-delimited fields (chromosome, start, end, name, score, strand)\n- GFF (General Feature Format) and GTF (Gene Transfer Format) describe gene structures and annotations using tab-delimited fields\n- BigWig and BigBed are binary, indexed formats for efficient visualization and querying of continuous and discrete genomic data, respectively\n\n## Visualization Tools and Techniques\n- Genome browsers (UCSC Genome Browser, Ensembl, IGV) enable interactive exploration of genomic data by displaying various data tracks aligned to a reference genome\n- Heatmaps visualize patterns and relationships in genomic data matrices, with rows representing features (genes, samples) and columns representing conditions or samples, and colors indicating values\n- Principal Component Analysis (PCA) plots reduce high-dimensional genomic data to two or three dimensions, capturing the most significant sources of variation\n- Volcano plots combine statistical significance ($-log_{10}(p-value)$) and magnitude of change (log fold change) to identify differentially expressed genes or regions\n- Circos plots depict genomic rearrangements, chromatin interactions, or other genomic relationships in a circular layout\n- Network diagrams represent interactions or functional relationships between genes, proteins, or other biological entities\n- Pathway maps illustrate the involvement of genes or proteins in biological processes or signaling cascades\n- Track-based visualizations (coverage 
plots, read alignments) help assess data quality, identify genomic features, and detect patterns or anomalies\n\n## Statistical Methods for Genomic Analysis\n- Hypothesis testing evaluates the statistical significance of observed differences or associations using p-values\n - Multiple testing correction (Bonferroni, FDR) adjusts p-values to control false positives when conducting numerous tests simultaneously\n- Differential expression analysis identifies genes with significant changes in expression levels between conditions using methods like DESeq2 or edgeR\n- Enrichment analysis assesses the overrepresentation of functional categories or pathways among a set of genes using tools like GSEA or DAVID\n- Clustering algorithms (hierarchical, k-means) group similar samples or genes based on their genomic profiles to discover patterns or subtypes\n- Machine learning techniques (classification, regression) build predictive models from genomic features to infer biological outcomes or traits\n- Survival analysis investigates the relationship between genomic variables and time-to-event outcomes using methods like Kaplan-Meier curves or Cox proportional hazards models\n- Bayesian inference incorporates prior knowledge and updates beliefs based on observed data to estimate posterior probabilities of genomic events or parameters\n\n## Data Preprocessing and Quality Control\n- Raw data processing converts sequencing machine outputs (BCL files) into readable formats (FASTQ) and performs initial quality checks\n- Quality assessment tools (FastQC, MultiQC) generate reports on sequencing data quality metrics (base quality scores, GC content, duplication rates)\n- Adapter trimming removes adapter sequences from reads to avoid alignment artifacts and improve mapping accuracy\n- Quality filtering removes low-quality reads or bases to enhance downstream analysis reliability\n- Contamination detection identifies and removes reads originating from non-target organisms (bacteria, 
viruses) to avoid biases\n- Read deduplication removes PCR duplicates to mitigate amplification biases and improve quantification accuracy\n- Batch effect correction normalizes data to minimize technical variations across samples or experiments\n- Data normalization scales raw read counts to account for differences in library sizes, sequencing depths, or other systematic biases\n\n## Exploratory Data Analysis in Genomics\n- Data visualization techniques (PCA, t-SNE, UMAP) help identify patterns, outliers, or batch effects in high-dimensional genomic datasets\n- Sample clustering groups samples based on their genomic profiles to discover subpopulations or disease subtypes\n- Correlation analysis assesses the strength and direction of relationships between genomic features or samples\n- Dimensionality reduction methods (PCA, NMF) extract key features or components that capture the most relevant information in the data\n- Unsupervised learning algorithms (clustering, anomaly detection) explore data structure and identify novel patterns without prior labels\n- Annotation enrichment analysis identifies overrepresented functional categories, pathways, or motifs in a set of genomic features\n- Integrative analysis combines multiple data types (gene expression, epigenetics, proteomics) to gain a more comprehensive understanding of biological systems\n\n## Advanced Analysis Techniques\n- Deep learning models (CNNs, RNNs) learn complex patterns and representations from genomic sequences or features for tasks like variant calling, gene expression prediction, or disease classification\n- Graph-based methods represent genomic data as networks to study relationships, interactions, or community structures\n- Topological data analysis captures higher-order interactions and structures in genomic datasets using techniques like persistent homology\n- Causal inference methods (Mendelian randomization, mediation analysis) infer causal relationships between genomic variables and 
phenotypes\n- Spatial transcriptomics combines gene expression profiling with spatial information to study tissue heterogeneity and cellular interactions\n- Single-cell genomics analyzes individual cells to uncover cellular diversity, lineage relationships, and rare cell types\n- Metagenomics studies the collective genomes of microbial communities to understand their composition, function, and interactions with the environment or host\n- Multi-omics integration combines data from different molecular levels (genome, transcriptome, proteome, metabolome) to obtain a systems-level understanding of biological processes\n\n## Practical Applications and Case Studies\n- Cancer genomics identifies driver mutations, molecular subtypes, and therapeutic targets by analyzing tumor genomes and transcriptomes\n- Precision medicine tailors treatments to individual patients based on their genomic profiles and other molecular characteristics\n- Genetic association studies (GWAS) identify genetic variants associated with complex traits or diseases by comparing allele frequencies between cases and controls\n- Pharmacogenomics investigates how genetic variations influence drug response and guides personalized medication choices\n- Agricultural genomics applies genomic techniques to improve crop yields, resistance to stresses, and nutritional quality\n- Evolutionary genomics studies the evolution of genomes across species to understand the mechanisms of adaptation, speciation, and phylogenetic relationships\n- Forensic genomics uses DNA evidence to identify individuals, establish kinship, or solve crimes\n- Microbiome analysis characterizes the composition and function of microbial communities in different environments (gut, soil, water) and their impact on health or ecosystem processes","active":true,"order":11,"meta":{"title":"Genomic Data Visualization & Analysis | Computational Genomics Class Notes","description":"Study guides to review Genomic Data Visualization & Analysis. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"raivLMuWfqHSK3Sq","type":"STUDY_GUIDE","title":"11.1 Genome browsers","slug":"genome-browsers","date":null,"keyTopics":[],"publicId":"raivLMuWfqHSK3Sq","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["G9xJJL60dKSpcNI1"],"duration":8},{"id":"PAMGQShhrae2Cdjp","type":"STUDY_GUIDE","title":"11.2 Heatmaps and clustering","slug":"heatmaps-clustering","date":null,"keyTopics":[],"publicId":"PAMGQShhrae2Cdjp","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["OBuqpoGXUVsKq5Dm"],"duration":9},{"id":"M744RJzgEiCOV0CJ","type":"STUDY_GUIDE","title":"11.3 Principal component analysis (PCA)","slug":"principal-component-analysis-pca","date":null,"keyTopics":[],"publicId":"M744RJzgEiCOV0CJ","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["RdmL3h1dgNAQtrc9"],"duration":9},{"id":"L7ADV4agY8FtRgJH","type":"STUDY_GUIDE","title":"11.4 Network visualization","slug":"network-visualization","date":null,"keyTopics":[],"publicId":"L7ADV4agY8FtRgJH","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["9HQUTaFr6QdZKRNU"],"duration":8},{"id":"sIv6AWYoN55yVmUo","type":"STUDY_GUIDE","title":"11.5 Data integration and multi-omics analysis","slug":"data-integration-multi-omics-analysis","date":null,"keyTopics":[],"publicId":"sIv6AWYoN55yVmUo","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["KaaXb9P1Hoo0nMiN"],"duration":12}],"numResources":1},{"id":"nmNe1iFif3JsxiVB","name":"Unit 12 – Ethical & Social Impact of 
Genomics","emoji":"📚","slug":"unit-12","description":"Unit 12: Ethical and Social Implications of Genomic Research","intro":"Genomics, the study of an organism's complete DNA set, has revolutionized our understanding of genetics and health. From genome sequencing to personalized medicine, this field offers powerful insights into genetic variations and disease susceptibility.\n\nEthical considerations in genomics research are paramount. Principles like respect for persons, beneficence, and justice guide researchers in protecting participants' rights and privacy. Social implications of genetic testing, data protection, and genetic discrimination are key challenges facing the field.","overview":"## Key Concepts in Genomics\n- Genomics involves the study of an organism's complete set of DNA, including all genes and non-coding regions\n- Genome sequencing technologies (Sanger sequencing, next-generation sequencing) have revolutionized the field by enabling rapid and cost-effective analysis of entire genomes\n - Sanger sequencing, developed in the 1970s, was the first widely used method for DNA sequencing\n - Next-generation sequencing (NGS) platforms, introduced in the 2000s, have greatly increased the speed and reduced the cost of genome sequencing\n- Bioinformatics plays a crucial role in genomics by providing computational tools and algorithms for analyzing and interpreting vast amounts of genomic data\n- Genomic variations, such as single nucleotide polymorphisms (SNPs) and structural variations (insertions, deletions, and copy number variations), contribute to genetic diversity and can influence an individual's susceptibility to diseases\n- Genome-wide association studies (GWAS) aim to identify genetic variants associated with specific traits or diseases by comparing the genomes of affected and unaffected individuals\n- Personalized medicine leverages genomic information to tailor medical treatments and interventions based on an individual's genetic profile, 
potentially improving treatment outcomes and reducing adverse drug reactions\n- Pharmacogenomics focuses on understanding how genetic variations influence an individual's response to medications, enabling the development of targeted therapies and optimized drug dosing\n- Epigenetics, the study of heritable changes in gene expression that do not involve alterations to the underlying DNA sequence, adds another layer of complexity to genomic research and has implications for understanding gene regulation and disease development\n\n## Ethical Principles in Genomics Research\n- Respect for persons emphasizes the importance of protecting the autonomy of research participants and ensuring that they are fully informed about the risks and benefits of participating in genomic studies\n - This principle requires obtaining informed consent from participants, allowing them to make voluntary decisions about their involvement in research\n- Beneficence obliges researchers to maximize the potential benefits of their work while minimizing harm to participants and society as a whole\n- Justice demands fair and equitable distribution of the benefits and burdens of genomic research, ensuring that no group is disproportionately affected or excluded from participation\n- Privacy and confidentiality are critical ethical considerations in genomics research, as genetic information is highly personal and sensitive\n - Researchers must implement robust data protection measures to safeguard participants' genomic data and prevent unauthorized access or misuse\n- Transparency and openness in research practices, including data sharing and publication of findings, promote scientific progress and accountability while respecting participant privacy\n- Community engagement involves actively involving relevant stakeholders, such as patient advocacy groups and underrepresented communities, in the research process to ensure that their perspectives and concerns are addressed\n- Responsible communication 
of research findings is essential to prevent misinterpretation or misuse of genomic information by the public, media, or policymakers\n - Researchers should present their findings in an accurate, balanced, and accessible manner, emphasizing the limitations and uncertainties inherent in genomic research\n\n## Social Implications of Genetic Testing\n- Genetic testing can provide individuals with valuable information about their health risks, enabling proactive measures for disease prevention and early detection\n - For example, genetic testing for BRCA1 and BRCA2 mutations can identify individuals at increased risk of developing breast and ovarian cancer, allowing for enhanced screening and preventive interventions\n- However, genetic testing results can also have significant psychological and emotional impacts on individuals and their families, particularly when revealing predisposition to serious or untreatable conditions\n- The interpretation and communication of genetic testing results are complex and require appropriate genetic counseling to help individuals understand the implications and make informed decisions\n- Genetic testing can raise concerns about privacy and discrimination, as individuals may face social stigma or discrimination based on their genetic information\n - For instance, employers or insurance companies might use genetic information to make decisions about hiring, promotion, or coverage, leading to potential genetic discrimination\n- The availability and accessibility of genetic testing services can exacerbate existing health disparities, as disadvantaged populations may have limited access to these technologies and the associated benefits\n- Direct-to-consumer (DTC) genetic testing, which allows individuals to access genetic testing services without the involvement of healthcare professionals, raises additional concerns about the accuracy, interpretation, and potential misuse of genetic information\n- The use of genetic testing in 
reproductive decision-making, such as preimplantation genetic diagnosis (PGD) and prenatal testing, raises ethical questions about the selection of embryos or the termination of pregnancies based on genetic characteristics\n- The societal implications of genetic testing extend beyond individual health, as the aggregation of genetic data can contribute to population-level research and public health initiatives, but also raises concerns about group privacy and the potential for genetic profiling\n\n## Privacy and Data Protection in Genomics\n- Genomic data is highly sensitive and personal, as it can reveal information about an individual's health, ancestry, and potentially their family members\n- Protecting the privacy and confidentiality of genomic data is essential to maintain public trust in genomic research and prevent misuse or unauthorized disclosure of personal information\n- Informed consent processes must clearly communicate how genomic data will be collected, stored, and shared, allowing individuals to make informed decisions about their participation in research or genetic testing\n- De-identification techniques, such as anonymization or pseudonymization, can help protect individual privacy by removing or replacing personally identifiable information from genomic datasets\n - However, the uniqueness of genomic data makes complete anonymization challenging, as re-identification may be possible through data triangulation or inference attacks\n- Data access control measures, such as tiered access systems and secure data enclaves, can help ensure that only authorized individuals have access to sensitive genomic data for legitimate research purposes\n- Encryption and secure data storage practices are essential to protect genomic data from unauthorized access, breaches, or cyber attacks\n- Governance frameworks, such as data access committees and data use agreements, provide oversight and guidance on the responsible sharing and use of genomic data, balancing 
research needs with individual privacy rights\n- International data sharing initiatives, such as the Global Alliance for Genomics and Health (GA4GH), aim to develop harmonized standards and protocols for the secure and ethical sharing of genomic data across borders\n - These efforts are crucial for advancing genomic research and medicine, but also require careful consideration of varying legal and cultural contexts surrounding data privacy and protection\n\n## Genetic Discrimination and Legal Frameworks\n- Genetic discrimination occurs when individuals are treated unfairly or denied opportunities based on their genetic information, such as predisposition to certain diseases or health conditions\n- Discrimination can occur in various contexts, including employment, insurance, and social interactions, leading to stigmatization and unequal treatment of individuals based on their genetic characteristics\n- In the United States, the Genetic Information Nondiscrimination Act (GINA) of 2008 prohibits discrimination based on genetic information in employment and health insurance\n - GINA protects individuals from being denied employment or health insurance coverage based on their genetic test results or family medical history\n - However, GINA does not cover other forms of insurance, such as life, disability, or long-term care insurance, leaving potential gaps in protection against genetic discrimination\n- Other countries have enacted similar anti-discrimination laws, such as the Genetic Non-Discrimination Act in Canada and the Genetic Discrimination Law in Israel, to protect individuals from unfair treatment based on their genetic information\n- Despite legal protections, the fear of genetic discrimination can still deter individuals from undergoing genetic testing or participating in genomic research, potentially limiting the benefits of these technologies\n- Employers and insurance companies may still use genetic information indirectly, such as through family medical 
history or by inferring genetic predispositions from other health data, making it difficult to detect and prove instances of genetic discrimination\n- Legal frameworks must continually evolve to keep pace with advances in genomic technologies and the changing landscape of genetic discrimination risks\n- Public education and awareness campaigns are essential to help individuals understand their rights and protections against genetic discrimination and to promote the responsible use of genetic information in society\n\n## Cultural and Religious Perspectives on Genomics\n- Cultural and religious beliefs can significantly influence individuals' attitudes towards genomic research, genetic testing, and the application of genomic technologies\n- Some religious traditions, such as Catholicism and Islam, may have concerns about the use of genetic technologies for purposes that conflict with their moral or ethical principles, such as embryonic stem cell research or genetic modification\n- Indigenous communities may view genomic research as a form of cultural appropriation or exploitation, particularly if their genetic heritage is studied without proper consent, respect for their traditions, or benefit-sharing arrangements\n - For example, the Havasupai Tribe in the United States sued researchers for using their genetic samples for studies beyond the originally consented diabetes research, highlighting the importance of culturally sensitive and participatory research practices\n- Certain cultural groups may have stigmas or taboos surrounding genetic disorders or disabilities, leading to the marginalization or discrimination of affected individuals and their families\n- Religious beliefs about the sanctity of life and the moral status of embryos can shape attitudes towards reproductive genetic technologies, such as preimplantation genetic diagnosis (PGD) and prenatal testing\n- Cultural norms around family structure, lineage, and marriage may influence the uptake and 
interpretation of genetic testing results, particularly in relation to carrier screening for recessive disorders or the disclosure of incidental findings\n- Effective communication and cultural competency are essential for healthcare providers and researchers to navigate the diverse cultural and religious perspectives on genomics and provide culturally sensitive care and support\n- Engaging with community leaders, religious authorities, and cultural organizations can help foster trust, understanding, and collaboration between genomic researchers and the communities they serve\n- Incorporating cultural and religious perspectives into the design and implementation of genomic research and healthcare initiatives can help ensure that the benefits of genomic technologies are realized in a socially and ethically responsible manner\n\n## Equity and Access in Genomic Medicine\n- Genomic medicine has the potential to revolutionize healthcare by enabling personalized approaches to disease prevention, diagnosis, and treatment based on an individual's genetic profile\n- However, there are significant disparities in access to genomic technologies and services, both within and between countries, which can exacerbate existing health inequities\n- Socioeconomic factors, such as income, education, and geographic location, can influence an individual's ability to access and afford genomic testing and personalized treatments\n - For example, the high cost of genomic sequencing and targeted therapies may limit their availability to individuals with comprehensive health insurance or the financial means to pay out-of-pocket\n- Disparities in genomic research participation can lead to an underrepresentation of certain populations, such as racial and ethnic minorities, in genomic databases and studies, limiting the generalizability and applicability of research findings\n- Limited diversity in genomic reference databases can result in biased interpretations of genetic variants and reduced 
accuracy of genomic risk assessments for underrepresented populations\n- Inadequate access to genetic counseling services and limited genomic literacy among healthcare providers and the public can hinder the effective communication and utilization of genomic information in healthcare decision-making\n- Efforts to improve equity and access in genomic medicine include increasing diversity in genomic research cohorts, developing low-cost and accessible genomic technologies, and promoting genomic education and awareness\n- Collaborative international initiatives, such as the Human Heredity and Health in Africa (H3Africa) consortium, aim to build capacity for genomic research and medicine in low- and middle-income countries, promoting global health equity\n- Integrating genomic medicine into public health systems and universal healthcare coverage can help ensure that the benefits of genomic technologies are distributed fairly and reach those most in need\n- Addressing the social, economic, and political determinants of health is crucial for realizing the full potential of genomic medicine and reducing health disparities at the population level\n\n## Future Challenges and Considerations\n- As genomic technologies continue to advance rapidly, it is essential to anticipate and address the ethical, legal, and social implications (ELSI) that may arise in the future\n- The increasing availability and affordability of whole-genome sequencing may lead to a surge in the generation of personal genomic data, raising concerns about data privacy, security, and ownership\n - Developing robust data governance frameworks and encryption technologies will be crucial to protect individual privacy while enabling responsible data sharing for research and clinical purposes\n- The proliferation of direct-to-consumer (DTC) genetic testing services may pose challenges for the regulation and quality control of genomic information provided to consumers\n - Ensuring the accuracy, validity, and 
appropriate interpretation of DTC genetic testing results will require collaboration between regulatory bodies, industry stakeholders, and healthcare professionals\n- The integration of genomic data with other types of personal data, such as electronic health records, wearables, and social media, may create new opportunities for personalized medicine but also raise concerns about data linkage, privacy, and potential misuse\n- The increasing use of artificial intelligence (AI) and machine learning algorithms in genomic data analysis may introduce biases and exacerbate disparities if not developed and applied responsibly\n - Ensuring the transparency, accountability, and fairness of AI-driven genomic tools will be essential to maintain public trust and prevent unintended consequences\n- The potential for gene editing technologies, such as CRISPR-Cas9, to be used for germline modifications in human embryos raises profound ethical questions about the permissibility and societal implications of altering the human genome\n - Developing global governance frameworks and public engagement initiatives will be necessary to guide the responsible development and use of gene editing technologies\n- Addressing the equitable distribution of the benefits and risks of genomic technologies will require ongoing efforts to promote diversity, inclusion, and social justice in genomic research and medicine\n- Fostering interdisciplinary collaboration among scientists, healthcare professionals, ethicists, policymakers, and community stakeholders will be essential to navigate the complex challenges and opportunities presented by the future of genomics\n- Investing in genomics education and literacy initiatives for both healthcare providers and the public will be crucial to ensure the responsible and effective translation of genomic discoveries into clinical practice and public health interventions","active":true,"order":12,"meta":{"title":"Ethical & Social Impact of Genomics | Computational 
Genomics Class Notes","description":"Study guides to review Ethical & Social Impact of Genomics. For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"fednbzdoGHz5VT9l","type":"STUDY_GUIDE","title":"12.1 Informed consent and privacy","slug":"informed-consent-privacy","date":null,"keyTopics":[],"publicId":"fednbzdoGHz5VT9l","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["gBHqBENZr3D5d9zG"],"duration":9},{"id":"LaQxvQNh1qTc5EI5","type":"STUDY_GUIDE","title":"12.2 Genetic discrimination","slug":"genetic-discrimination","date":null,"keyTopics":[],"publicId":"LaQxvQNh1qTc5EI5","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["jtM5dFlV7eYbkAAp"],"duration":7},{"id":"7v2GgofphNghAx0q","type":"STUDY_GUIDE","title":"12.3 Incidental findings and return of results","slug":"incidental-findings-return-results","date":null,"keyTopics":[],"publicId":"7v2GgofphNghAx0q","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["uujlXwqVc6mUQRf2"],"duration":11},{"id":"gVNAmJCqvPAzvEaL","type":"STUDY_GUIDE","title":"12.4 Ownership and sharing of genomic data","slug":"ownership-sharing-genomic-data","date":null,"keyTopics":[],"publicId":"gVNAmJCqvPAzvEaL","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["7kqNbl7iFwVnxxZQ"],"duration":13},{"id":"wCAhhTjWLtUVlZf0","type":"STUDY_GUIDE","title":"12.5 Genomics and personalized 
medicine","slug":"genomics-personalized-medicine","date":null,"keyTopics":[],"publicId":"wCAhhTjWLtUVlZf0","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["AqpRI7LetZ41pII9"],"duration":8}],"numResources":1}],"exams":[]},"unit":{"id":"QRsrBZfXY9zICIhp","name":"Unit 9 – RNA-seq: Transcriptome Analysis","emoji":"📚","slug":"unit-9","description":"Unit 9: Transcriptomics and RNA-seq Analysis","intro":"RNA-seq is a powerful tool for studying gene expression in cells and tissues. It allows researchers to quantify RNA levels, identify new transcripts, and detect alternative splicing events. This technique has revolutionized transcriptomics by enabling genome-wide analysis of gene expression at unprecedented depth and resolution.\n\nKey concepts in transcriptomics include the transcriptome, gene expression, alternative splicing, and non-coding RNAs. Understanding these concepts is crucial for designing RNA-seq experiments, analyzing data, and interpreting results. 
Proper experimental design, sample preparation, and quality control are essential for generating reliable RNA-seq data.","overview":"## What's RNA-seq?\n- RNA-seq (RNA sequencing) is a high-throughput sequencing technology used to study the transcriptome, which is the complete set of RNA transcripts in a cell or tissue at a specific time point\n- Enables researchers to quantify gene expression levels, identify novel transcripts, and discover alternative splicing events by directly sequencing the RNA molecules present in a sample\n- Provides a snapshot of the active genes and their expression levels under specific conditions (developmental stage, disease state, or treatment)\n- Offers several advantages over traditional gene expression profiling methods (microarrays) including higher sensitivity, wider dynamic range, and the ability to detect novel transcripts without prior knowledge of the genome sequence\n- Generates large amounts of data that require computational analysis to extract biologically meaningful insights\n - Typical RNA-seq experiment produces millions of short reads (50-150 base pairs) that need to be aligned to a reference genome or assembled de novo\n - Expression levels are quantified by counting the number of reads mapping to each gene or transcript\n- Has revolutionized the field of transcriptomics by enabling genome-wide analysis of gene expression at an unprecedented resolution and depth\n- Widely applied in various research areas (developmental biology, cancer research, and plant sciences) to understand the molecular mechanisms underlying biological processes and diseases\n\n## Key Concepts in Transcriptomics\n- Transcriptome refers to the complete set of RNA transcripts present in a cell or tissue at a given time point, including messenger RNAs (mRNAs), non-coding RNAs (ncRNAs), and small RNAs\n- Gene expression is the process by which the information encoded in a gene is used to synthesize functional gene products, primarily proteins\n - 
Expression levels can be quantified by measuring the abundance of mRNA transcripts produced from each gene\n - Differential gene expression refers to the changes in expression levels between different conditions (e.g., healthy vs. diseased, treated vs. untreated)\n- Alternative splicing is a regulated process during gene expression that allows a single gene to produce multiple mRNA isoforms, potentially encoding different protein variants\n - Occurs when exons are included or excluded from the final processed mRNA, or when introns are retained\n - Contributes to the diversity of the proteome and plays a crucial role in cell differentiation, development, and disease\n- Non-coding RNAs (ncRNAs) are functional RNA molecules that are not translated into proteins, but have important regulatory roles in gene expression and cellular processes\n - Examples include long non-coding RNAs (lncRNAs), microRNAs (miRNAs), and small interfering RNAs (siRNAs)\n- RNA editing is a post-transcriptional modification that alters the nucleotide sequence of an RNA molecule, potentially changing the amino acid sequence of the encoded protein or affecting the stability and localization of the RNA\n- Gene fusion occurs when two separate genes are joined together due to chromosomal rearrangements, resulting in the production of a chimeric RNA and potentially a fusion protein with altered function\n - Often associated with cancer development and can serve as diagnostic biomarkers or therapeutic targets\n\n## RNA-seq Experimental Design\n- Defining the research question and hypothesis is crucial for designing an appropriate RNA-seq experiment\n - Clearly state the biological question to be addressed and the specific hypotheses to be tested\n - Consider the type of samples to be analyzed (e.g., tissues, cell lines, or single cells) and the comparisons to be made (e.g., treatment vs. 
control, time course, or different developmental stages)\n- Determining the sequencing depth and read length depends on the research goals and the complexity of the transcriptome\n - Higher sequencing depth (more reads per sample) increases the sensitivity to detect low-abundance transcripts and rare isoforms\n - Longer read lengths (100-150 bp) improve the accuracy of transcript assembly and isoform identification\n - Balance between sequencing depth and number of biological replicates based on budget and experimental design\n- Selecting the appropriate number of biological replicates is essential for statistical power and reproducibility\n - Biological replicates (independent samples from different individuals or experiments) capture the biological variability and allow for robust differential expression analysis\n - At least three biological replicates per condition are recommended, with more replicates increasing the power to detect significant differences\n- Choosing the RNA extraction and library preparation methods based on the sample type and research objectives\n - Total RNA sequencing captures all RNA species, including mRNAs, ncRNAs, and small RNAs\n - mRNA sequencing (poly(A) selection) enriches for mature mRNAs by targeting the poly(A) tail\n - Ribosomal RNA (rRNA) depletion removes the highly abundant rRNA molecules to increase the coverage of other RNA species\n - Strand-specific protocols preserve the information about the originating strand of the RNA transcripts\n- Planning for data storage, management, and analysis infrastructure is important given the large volume of data generated by RNA-seq experiments\n - Raw sequencing data (FASTQ files) and processed data (aligned reads, count matrices) require significant storage capacity\n - Computational resources (high-performance computing clusters or cloud-based platforms) are necessary for data processing and analysis\n - Establish a data management plan for organizing, backing up, and sharing the 
data in accordance with FAIR (Findable, Accessible, Interoperable, and Reusable) principles\n\n## Sample Prep and Sequencing\n- RNA extraction is the first step in sample preparation, which involves isolating total RNA from the biological samples\n - Use commercially available kits (TRIzol, RNeasy) or phenol-chloroform extraction methods\n - Assess RNA quality and integrity using spectrophotometry (NanoDrop) and capillary electrophoresis (Bioanalyzer or TapeStation)\n - High-quality RNA with minimal degradation (RIN > 8) is essential for successful library preparation and sequencing\n- Library preparation converts the RNA molecules into cDNA libraries compatible with the sequencing platform\n - Fragmentation of RNA into smaller pieces (200-500 bp) to ensure uniform coverage across the transcriptome\n - Reverse transcription to synthesize cDNA from the fragmented RNA using random hexamer primers\n - Adapter ligation to attach platform-specific sequences to the ends of the cDNA fragments, enabling amplification and sequencing\n - Amplification of the cDNA library using PCR to increase the amount of material for sequencing\n - Size selection to enrich for fragments of the desired length and remove adapter dimers and other artifacts\n- Multiplexing allows for the simultaneous sequencing of multiple samples in a single run by using unique barcodes (sample-specific sequences) added during library preparation\n - Reduces sequencing costs and increases throughput\n - Requires careful design to ensure balanced representation of samples and avoid barcode crosstalk\n- Sequencing platforms (Illumina, PacBio, Oxford Nanopore) generate millions to billions of short reads (50-150 bp) or longer reads (1-100 kb) from the cDNA libraries\n - Illumina sequencing (HiSeq, NextSeq) is the most widely used platform for RNA-seq, offering high accuracy, throughput, and cost-effectiveness\n - PacBio and Oxford Nanopore sequencing provide longer reads that can improve the resolution of 
complex isoforms and splice variants, but have higher error rates and lower throughput compared to Illumina\n- Sequencing depth and read length should be chosen based on the research objectives and the complexity of the transcriptome\n - Aim for at least 20-30 million reads per sample for differential expression analysis of coding genes\n - Increase sequencing depth (50-100 million reads) for detecting low-abundance transcripts, non-coding RNAs, or rare isoforms\n - Longer read lengths (100-150 bp) improve the accuracy of transcript assembly and isoform identification\n\n## Quality Control and Preprocessing\n- Quality assessment of raw sequencing data (FASTQ files) is crucial for identifying and addressing any issues that may affect downstream analysis\n - Use tools like FastQC or MultiQC to generate quality control reports\n - Check for base quality scores, GC content, sequence duplication levels, and overrepresented sequences (adapters, primers)\n - Low-quality bases (Q < 20) and adapter sequences should be trimmed using tools like Trimmomatic or Cutadapt to improve the accuracy of alignment and quantification\n- Filtering out low-quality reads and contaminants helps to reduce noise and improve the signal-to-noise ratio in the data\n - Remove reads with a high proportion of low-quality bases (e.g., >50% bases with Q < 20)\n - Discard reads aligning to ribosomal RNA (rRNA) or other contaminating sequences (e.g., PhiX control) using tools like SortMeRNA or Bowtie2\n - Trim or filter out reads with adapter sequences, poly(A) tails, or other technical artifacts\n- Read trimming involves removing low-quality bases and adapter sequences from the ends of the reads\n - Performed using tools like Trimmomatic, Cutadapt, or BBDuk\n - Improves the accuracy of alignment and quantification by ensuring that only high-quality bases are used in the analysis\n - Trims bases below a specified quality threshold (e.g., Q < 20) from the 3' end of the reads\n - Removes adapter 
sequences by matching the read sequences against a library of known adapter sequences\n- Read deduplication identifies and removes PCR duplicates, which are reads originating from the same cDNA fragment during library amplification\n - PCR duplicates can introduce biases in quantification and lead to overestimation of expression levels\n - Tools like Picard MarkDuplicates or Clumpify can be used to identify and remove duplicate reads based on their alignment positions\n - Deduplication is more important for low-complexity libraries (e.g., small RNA-seq) or when using PCR-based library preparation methods\n- Quality control metrics and thresholds should be carefully evaluated and reported to ensure the reproducibility and reliability of the results\n - Base quality scores: Aim for a median Q-score > 30 and no more than 20% bases with Q < 20\n - Adapter content: Less than 10% of reads should contain adapter sequences after trimming\n - rRNA contamination: Less than 5% of reads should align to rRNA sequences\n - Alignment rate: At least 70-80% of reads should align uniquely to the reference genome or transcriptome\n - Read duplication rate: Depends on the library complexity and preparation method, but should be consistent across samples\n\n## Alignment and Quantification\n- Alignment maps the preprocessed reads to a reference genome or transcriptome to determine their originating locations\n - Spliced aligners (STAR, HISAT2, TopHat2) are used to handle reads spanning exon-exon junctions\n - Alignment parameters (e.g., mismatch rate, gap penalties) should be optimized based on the read length and quality\n - Alignment quality metrics (e.g., uniquely mapped reads, multi-mapped reads, unmapped reads) should be evaluated to assess the quality of the alignment\n- Transcript assembly reconstructs the full-length transcripts from the aligned reads, allowing for the identification of novel isoforms and splice variants\n - Reference-guided assembly (Cufflinks, StringTie) uses 
the reference genome annotation to guide the assembly process\n - De novo assembly (Trinity, Oases) reconstructs transcripts without relying on a reference genome, enabling the discovery of novel transcripts in non-model organisms\n - Hybrid assembly approaches (HISAT-StringTie, STAR-Cufflinks) combine reference-guided and de novo methods to improve the accuracy and completeness of the assembly\n- Quantification estimates the expression levels of genes and transcripts by counting the number of reads or fragments mapping to each feature\n - Count-based methods (HTSeq, featureCounts) assign reads to genes or exons based on their alignment positions\n - Requires a gene annotation file (GTF/GFF) to define the genomic coordinates of the features\n - Generates a count matrix with rows representing genes and columns representing samples\n - Transcript-level quantification (RSEM, Kallisto, Salmon) estimates the abundance of individual isoforms by probabilistically assigning reads to transcripts\n - Uses the principles of pseudoalignment or quasi-mapping to rapidly assign reads to transcripts without the need for a full alignment\n - Outputs TPM (Transcripts Per Million) or FPKM (Fragments Per Kilobase Million) values, which normalize for transcript length and library size\n- Normalization adjusts the raw read counts to account for differences in library size, sequencing depth, and other technical factors that may affect the comparison of expression levels across samples\n - CPM (Counts Per Million) and RPM (Reads Per Million) normalize the read counts by the total number of mapped reads in each sample\n - TPM (Transcripts Per Million) and FPKM (Fragments Per Kilobase Million) additionally normalize for transcript length, making them more suitable for comparing expression levels across genes\n - TMM (Trimmed Mean of M-values) and DESeq2's median-of-ratios method are more robust normalization methods that account for differences in library composition and reduce the impact 
of highly expressed genes\n- Batch effect correction removes systematic biases introduced by technical factors (e.g., sequencing run, library preparation batch) that can confound the biological variation of interest\n - Tools like ComBat, SVA, and RUVSeq can be used to identify and correct for batch effects using statistical methods\n - Batch effects should be assessed and corrected for before downstream analysis to avoid false positives and improve the reproducibility of the results\n\n## Differential Expression Analysis\n- Differential expression analysis identifies genes or transcripts that are significantly up- or down-regulated between experimental conditions\n - Requires a count matrix of read counts for each gene or transcript across all samples (count-based tools such as DESeq2 and edgeR expect raw, un-normalized counts and perform normalization internally)\n - Compares the expression levels between two or more groups (e.g., treatment vs. control, time points, tissue types) to identify genes with statistically significant changes in expression\n- Statistical methods for differential expression analysis model the distribution of read counts and test for significant differences between conditions\n - Negative binomial distribution (DESeq2, edgeR) is commonly used to model the overdispersion of read counts, accounting for both biological and technical variability\n - Generalized linear models (GLMs) are used to test for differential expression while controlling for confounding factors (e.g., batch effects, covariates)\n - Likelihood ratio tests (LRT) or Wald tests are used to assess the significance of the differential expression, generating p-values for each gene or transcript\n- Multiple testing correction adjusts the p-values to control for the false discovery rate (FDR) when performing numerous simultaneous hypothesis tests\n - Bonferroni correction is a conservative method that multiplies the p-values by the number of tests performed, ensuring a family-wise error rate (FWER) of less than the specified threshold (e.g., 0.05)\n - Benjamini-Hochberg 
procedure is a more powerful method that controls the FDR, which is the expected proportion of false positives among all significant results\n - Adjusted p-values (q-values) < 0.05 are typically considered significant, but the threshold can be adjusted based on the desired balance between sensitivity and specificity\n- Fold change cutoffs are often used in combination with statistical significance to identify biologically meaningful changes in expression\n - Fold change (FC) is calculated as the ratio of the average normalized expression levels between two conditions (e.g., treatment/control)\n - Log2 fold change (LFC) is the logarithm (base 2) of the fold change, with positive values indicating up-regulation and negative values indicating down-regulation\n - A commonly used fold change cutoff is a two-fold change in either direction (FC > 2 or FC < 0.5, i.e., |LFC| > 1), but the choice of threshold depends on the biological context and the desired level of stringency\n- Visualization of differential expression results helps to interpret and communicate the findings\n - Volcano plots display the statistical significance (-log10 p-value) against the fold change (log2 FC) for each gene, highlighting genes that are both statistically significant and biologically meaningful\n - Heatmaps cluster genes and samples based on their expression patterns, revealing groups of co-regulated genes and sample relationships\n - MA plots (M-A plots) show the average expression level (A) against the fold change (M) for each gene, helping to identify intensity-dependent biases and assess the overall distribution of differential expression\n\n## Functional Interpretation\n- Gene set enrichment analysis (GS","active":true,"order":9,"meta":{"title":"RNA-seq: Transcriptome Analysis | Computational Genomics Class Notes","description":"Study guides to review RNA-seq: Transcriptome Analysis. 
For college students taking Computational Genomics."},"metaDesc":null,"resources":[{"id":"EmX26AxpxQtHCk2T","type":"STUDY_GUIDE","title":"9.1 RNA isolation and library preparation","slug":"rna-isolation-library-preparation","date":null,"keyTopics":[],"publicId":"EmX26AxpxQtHCk2T","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["5Bx3HMnMEh8FGIwo"],"duration":11},{"id":"Sgbz2v78n5w6Jpvf","type":"STUDY_GUIDE","title":"9.2 RNA-seq data analysis","slug":"rna-seq-data-analysis","date":null,"keyTopics":[],"publicId":"Sgbz2v78n5w6Jpvf","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["1UYjIcOGfFz9BTxr"],"duration":12},{"id":"XgFJq4aaYf0bUT1s","type":"STUDY_GUIDE","title":"9.4 Alternative splicing analysis","slug":"alternative-splicing-analysis","date":null,"keyTopics":[],"publicId":"XgFJq4aaYf0bUT1s","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Oyn4QsZ7wbM7upzy"],"duration":8},{"id":"YaOPd55WlPDCFUlp","type":"STUDY_GUIDE","title":"9.3 Differential gene expression","slug":"differential-gene-expression","date":null,"keyTopics":[],"publicId":"YaOPd55WlPDCFUlp","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["Ixh9gEuKk3jFLbVE"],"duration":10},{"id":"6GSEt5qoBER92OkV","type":"STUDY_GUIDE","title":"9.5 Gene co-expression networks","slug":"gene-co-expression-networks","date":null,"keyTopics":[],"publicId":"6GSEt5qoBER92OkV","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"computational-genomics"},"streamers":[],"creators":[],"topicIds":["ALySWGgX9MQUg44N"],"duration":13}],"numResources":1}}]}]]