1a:[[["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"itemListElement\":[]}"}}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"BreadcrumbList\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Intro To Biostatistics\",\"item\":\"https://library.fiveable.me/introduction-to-biostatistics\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Unit 11 – Statistical Software & Data Management\",\"item\":\"https://library.fiveable.me/introduction-to-biostatistics/unit-11\"}]}"}}]],["$","$L1b",null,{"initialReduxState":{"initialToc":{"units":[{"id":"3JKbwuGd1WPCxkn4","name":"Unit 1 – Descriptive Statistics","emoji":"📚","slug":"unit-1","hasResources":true,"resources":[{"id":"1O4EJ8pgTtT3hwre","title":"1.2 Measures of variability","slug":"measures-variability","type":"STUDY_GUIDE","date":null},{"id":"AShdABuaEPHvUkOX","title":"1.3 Data visualization techniques","slug":"data-visualization-techniques","type":"STUDY_GUIDE","date":null},{"id":"SJFSIp89wbjNlmXy","title":"1.4 Frequency distributions","slug":"frequency-distributions","type":"STUDY_GUIDE","date":null},{"id":"zS1nT2lH15ARMEao","title":"1.1 Measures of central tendency","slug":"measures-central-tendency","type":"STUDY_GUIDE","date":null},{"id":"7k8iyFWgRaojOQfV","title":"1.5 Percentiles and quartiles","slug":"percentiles-quartiles","type":"STUDY_GUIDE","date":null}]},{"id":"aJKTALy4JO4Ha2HP","name":"Unit 2 – Probability Theory","emoji":"📚","slug":"unit-2","hasResources":true,"resources":[{"id":"78qVzpxfnnqB0ffq","title":"2.5 Random variables","slug":"random-variables","type":"STUDY_GUIDE","date":null},{"id":"BU58gyLMfFXIuB3C","title":"2.1 Basic probability concepts","slug":"basic-probability-concepts","type":"STUDY_GUIDE","date":null},{"id":"5z4EKgr8lbFrtIP9","title":"2.2 Probability 
distributions","slug":"probability-distributions","type":"STUDY_GUIDE","date":null},{"id":"msSAV8mbsAWYevRE","title":"2.3 Conditional probability","slug":"conditional-probability","type":"STUDY_GUIDE","date":null},{"id":"bpMo0NUQ16NeHesD","title":"2.4 Bayes' theorem","slug":"bayes-theorem","type":"STUDY_GUIDE","date":null}]},{"id":"b1830ayB02ogOzzO","name":"Unit 3 – Sampling Distributions","emoji":"📚","slug":"unit-3","hasResources":true,"resources":[{"id":"Z9UiEYxDfIVxibiI","title":"3.1 Central Limit Theorem","slug":"central-limit-theorem","type":"STUDY_GUIDE","date":null},{"id":"DH7vkKyKkP1nOHyQ","title":"3.2 Standard error","slug":"standard-error","type":"STUDY_GUIDE","date":null},{"id":"6OREsMCeFCfhONQ3","title":"3.3 Sampling distribution of the mean","slug":"sampling-distribution","type":"STUDY_GUIDE","date":null},{"id":"1G7IUwO0pJEmakGw","title":"3.4 Sampling distribution of the proportion","slug":"sampling-distribution-proportion","type":"STUDY_GUIDE","date":null},{"id":"xtzdxxU3t8kDuZhX","title":"3.5 T-distribution","slug":"t-distribution","type":"STUDY_GUIDE","date":null}]},{"id":"mFxTb3hBEGL2TIbD","name":"Unit 4 – Hypothesis Testing","emoji":"📚","slug":"unit-4","hasResources":true,"resources":[{"id":"sCtn0fY8EqV54NyF","title":"4.3 P-values","slug":"p-values","type":"STUDY_GUIDE","date":null},{"id":"e8FiRZIVBSyn3ZsQ","title":"4.1 Null and alternative hypotheses","slug":"null-alternative-hypotheses","type":"STUDY_GUIDE","date":null},{"id":"4HFN174Cffq1h1dl","title":"4.2 Type I and Type II errors","slug":"type-type-ii-errors","type":"STUDY_GUIDE","date":null},{"id":"p51GnPotzpReA1YZ","title":"4.4 Statistical power","slug":"statistical-power","type":"STUDY_GUIDE","date":null},{"id":"OMTloyn2JzB4y9sj","title":"4.5 One-sample tests","slug":"one-sample-tests","type":"STUDY_GUIDE","date":null},{"id":"CCihz0QBeycja5xI","title":"4.6 Two-sample tests","slug":"two-sample-tests","type":"STUDY_GUIDE","date":null}]},{"id":"5HqBzzlN71HHuIbr","name":"Unit 5 – Confidence 
Intervals in Biostatistics","emoji":"📚","slug":"unit-5","hasResources":true,"resources":[{"id":"yzoYEpYaMr6OybfI","title":"5.3 Confidence interval for the difference between means","slug":"confidence-interval-difference-means","type":"STUDY_GUIDE","date":null},{"id":"cn0l264Li1Lhmo6l","title":"5.1 Confidence interval for the mean","slug":"confidence-interval","type":"STUDY_GUIDE","date":null},{"id":"L5iQST0TYsoVUt4Y","title":"5.2 Confidence interval for the proportion","slug":"confidence-interval-proportion","type":"STUDY_GUIDE","date":null},{"id":"t3SOYhz7VsHCNY7a","title":"5.4 Confidence interval for the difference between proportions","slug":"confidence-interval-difference-proportions","type":"STUDY_GUIDE","date":null},{"id":"beytRL0iOZSeEzPK","title":"5.5 Interpreting confidence intervals","slug":"interpreting-confidence-intervals","type":"STUDY_GUIDE","date":null}]},{"id":"lrOzUV5QUEYM8Oz8","name":"Unit 6 – Regression Analysis","emoji":"📚","slug":"unit-6","hasResources":true,"resources":[{"id":"9STvcq6HC9ZMpi1B","title":"6.3 Logistic regression","slug":"logistic-regression","type":"STUDY_GUIDE","date":null},{"id":"mkiUjWxLZuIXeFcb","title":"6.1 Simple linear regression","slug":"simple-linear-regression","type":"STUDY_GUIDE","date":null},{"id":"dalNtc1LwM1X1VfS","title":"6.2 Multiple linear regression","slug":"multiple-linear-regression","type":"STUDY_GUIDE","date":null},{"id":"ibO4uUwy3WwUuSjw","title":"6.4 Model diagnostics","slug":"model-diagnostics","type":"STUDY_GUIDE","date":null},{"id":"wFuxzd24hwhbAiXv","title":"6.5 Correlation analysis","slug":"correlation-analysis","type":"STUDY_GUIDE","date":null}]},{"id":"pmM6iTkJlTrbCJhp","name":"Unit 7 – Analysis of Variance (ANOVA)","emoji":"📚","slug":"unit-7","hasResources":true,"resources":[{"id":"YOlFtb7aiCZsGamU","title":"7.5 Assumptions and diagnostics","slug":"assumptions-diagnostics","type":"STUDY_GUIDE","date":null},{"id":"60h4kp5pU0UzVNz7","title":"7.2 Two-way 
ANOVA","slug":"two-way-anova","type":"STUDY_GUIDE","date":null},{"id":"B6Y0sj2UQAFyRXEV","title":"7.3 Repeated measures ANOVA","slug":"repeated-measures-anova","type":"STUDY_GUIDE","date":null},{"id":"igNe9jDwwC8mMu8r","title":"7.4 Post-hoc tests","slug":"post-hoc-tests","type":"STUDY_GUIDE","date":null},{"id":"RfAD16ZrOUhh1kqw","title":"7.1 One-way ANOVA","slug":"one-way-anova","type":"STUDY_GUIDE","date":null}]},{"id":"53U9MGSYxB66PLbg","name":"Unit 8 – Experimental Design","emoji":"📚","slug":"unit-8","hasResources":true,"resources":[{"id":"pFXBLuu48ndsErJT","title":"8.1 Randomization","slug":"randomization","type":"STUDY_GUIDE","date":null},{"id":"PtDgoky63gDBaOrp","title":"8.2 Blinding","slug":"blinding","type":"STUDY_GUIDE","date":null},{"id":"PtNWzA6JlwMX79DD","title":"8.5 Factorial designs","slug":"factorial-designs","type":"STUDY_GUIDE","date":null},{"id":"nq1omVcRclG3w7vG","title":"8.3 Control groups","slug":"control-groups","type":"STUDY_GUIDE","date":null},{"id":"IgNoPSwD4JdUPNzl","title":"8.4 Sample size determination","slug":"sample-size-determination","type":"STUDY_GUIDE","date":null}]},{"id":"kVVMucz579uvt2r7","name":"Unit 9 – Survival Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"e4D2tkFWN44Q8fNq","title":"9.1 Kaplan-Meier estimator","slug":"kaplan-meier-estimator","type":"STUDY_GUIDE","date":null},{"id":"HGGR4TjJqb7DiHQy","title":"9.3 Censoring","slug":"censoring","type":"STUDY_GUIDE","date":null},{"id":"tbsbuwbhiTDaFU6Z","title":"9.5 Hazard ratios","slug":"hazard-ratios","type":"STUDY_GUIDE","date":null},{"id":"EfuaY8jSNC9ZUNSr","title":"9.4 Log-rank test","slug":"log-rank-test","type":"STUDY_GUIDE","date":null},{"id":"IiPy3TSXkudZ28cm","title":"9.2 Cox proportional hazards model","slug":"cox-proportional-hazards-model","type":"STUDY_GUIDE","date":null}]},{"id":"FD86VDr2IzTFDYGz","name":"Unit 10 – Epidemiological 
Measures","emoji":"📚","slug":"unit-10","hasResources":true,"resources":[{"id":"lpPmnlnjuaEDOqrx","title":"10.2 Relative risk","slug":"relative-risk","type":"STUDY_GUIDE","date":null},{"id":"OaxQ7C81fpnqbwej","title":"10.4 Sensitivity and specificity","slug":"sensitivity-specificity","type":"STUDY_GUIDE","date":null},{"id":"K0eQFC2DjGgjdIbx","title":"10.1 Incidence and prevalence","slug":"incidence-prevalence","type":"STUDY_GUIDE","date":null},{"id":"4hpy9rn5TvyC9J8w","title":"10.3 Odds ratio","slug":"odds-ratio","type":"STUDY_GUIDE","date":null},{"id":"EsfutmLq0YnqH0xX","title":"10.5 Attributable risk","slug":"attributable-risk","type":"STUDY_GUIDE","date":null}]},{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"uuCg3qOSetbDAukb","title":"11.1 Introduction to statistical software packages","slug":"introduction-statistical-software-packages","type":"STUDY_GUIDE","date":null},{"id":"ECob8I1hDH2SslWt","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","type":"STUDY_GUIDE","date":null},{"id":"fltCwHwrmxFkPzLD","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","type":"STUDY_GUIDE","date":null},{"id":"UdImkGkjLzRBsQ7s","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","type":"STUDY_GUIDE","date":null},{"id":"X9OMg6UVm28mXVLC","title":"11.3 Data visualization tools","slug":"data-visualization-tools","type":"STUDY_GUIDE","date":null}]}],"activeUnit":{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"uuCg3qOSetbDAukb","title":"11.1 Introduction to statistical software packages","slug":"introduction-statistical-software-packages","type":"STUDY_GUIDE","date":null},{"id":"ECob8I1hDH2SslWt","title":"11.4 Basic programming 
concepts","slug":"basic-programming-concepts","type":"STUDY_GUIDE","date":null},{"id":"fltCwHwrmxFkPzLD","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","type":"STUDY_GUIDE","date":null},{"id":"UdImkGkjLzRBsQ7s","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","type":"STUDY_GUIDE","date":null},{"id":"X9OMg6UVm28mXVLC","title":"11.3 Data visualization tools","slug":"data-visualization-tools","type":"STUDY_GUIDE","date":null}]}},"keyTerms":{"keyTerms":"$undefined"},"pageData":{"subject":{"id":"introduction-to-biostatistics","name":"Intro to Biostatistics","keyTermsActive":null,"generationMetadata":{}},"unit":{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"uuCg3qOSetbDAukb","title":"11.1 Introduction to statistical software packages","slug":"introduction-statistical-software-packages","type":"STUDY_GUIDE","date":null},{"id":"ECob8I1hDH2SslWt","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","type":"STUDY_GUIDE","date":null},{"id":"fltCwHwrmxFkPzLD","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","type":"STUDY_GUIDE","date":null},{"id":"UdImkGkjLzRBsQ7s","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","type":"STUDY_GUIDE","date":null},{"id":"X9OMg6UVm28mXVLC","title":"11.3 Data visualization tools","slug":"data-visualization-tools","type":"STUDY_GUIDE","date":null}]},"topic":"$undefined","content":"$undefined","apQuestionData":"$undefined"},"contentQueryData":{}},"initialToc":{"units":[{"id":"3JKbwuGd1WPCxkn4","name":"Unit 1 – Descriptive Statistics","emoji":"📚","slug":"unit-1","hasResources":true,"resources":[{"id":"1O4EJ8pgTtT3hwre","title":"1.2 Measures of variability","slug":"measures-variability","type":"STUDY_GUIDE","date":null},{"id":"AShdABuaEPHvUkOX","title":"1.3 Data visualization 
techniques","slug":"data-visualization-techniques","type":"STUDY_GUIDE","date":null},{"id":"SJFSIp89wbjNlmXy","title":"1.4 Frequency distributions","slug":"frequency-distributions","type":"STUDY_GUIDE","date":null},{"id":"zS1nT2lH15ARMEao","title":"1.1 Measures of central tendency","slug":"measures-central-tendency","type":"STUDY_GUIDE","date":null},{"id":"7k8iyFWgRaojOQfV","title":"1.5 Percentiles and quartiles","slug":"percentiles-quartiles","type":"STUDY_GUIDE","date":null}]},{"id":"aJKTALy4JO4Ha2HP","name":"Unit 2 – Probability Theory","emoji":"📚","slug":"unit-2","hasResources":true,"resources":[{"id":"78qVzpxfnnqB0ffq","title":"2.5 Random variables","slug":"random-variables","type":"STUDY_GUIDE","date":null},{"id":"BU58gyLMfFXIuB3C","title":"2.1 Basic probability concepts","slug":"basic-probability-concepts","type":"STUDY_GUIDE","date":null},{"id":"5z4EKgr8lbFrtIP9","title":"2.2 Probability distributions","slug":"probability-distributions","type":"STUDY_GUIDE","date":null},{"id":"msSAV8mbsAWYevRE","title":"2.3 Conditional probability","slug":"conditional-probability","type":"STUDY_GUIDE","date":null},{"id":"bpMo0NUQ16NeHesD","title":"2.4 Bayes' theorem","slug":"bayes-theorem","type":"STUDY_GUIDE","date":null}]},{"id":"b1830ayB02ogOzzO","name":"Unit 3 – Sampling Distributions","emoji":"📚","slug":"unit-3","hasResources":true,"resources":[{"id":"Z9UiEYxDfIVxibiI","title":"3.1 Central Limit Theorem","slug":"central-limit-theorem","type":"STUDY_GUIDE","date":null},{"id":"DH7vkKyKkP1nOHyQ","title":"3.2 Standard error","slug":"standard-error","type":"STUDY_GUIDE","date":null},{"id":"6OREsMCeFCfhONQ3","title":"3.3 Sampling distribution of the mean","slug":"sampling-distribution","type":"STUDY_GUIDE","date":null},{"id":"1G7IUwO0pJEmakGw","title":"3.4 Sampling distribution of the proportion","slug":"sampling-distribution-proportion","type":"STUDY_GUIDE","date":null},{"id":"xtzdxxU3t8kDuZhX","title":"3.5 
T-distribution","slug":"t-distribution","type":"STUDY_GUIDE","date":null}]},{"id":"mFxTb3hBEGL2TIbD","name":"Unit 4 – Hypothesis Testing","emoji":"📚","slug":"unit-4","hasResources":true,"resources":[{"id":"sCtn0fY8EqV54NyF","title":"4.3 P-values","slug":"p-values","type":"STUDY_GUIDE","date":null},{"id":"e8FiRZIVBSyn3ZsQ","title":"4.1 Null and alternative hypotheses","slug":"null-alternative-hypotheses","type":"STUDY_GUIDE","date":null},{"id":"4HFN174Cffq1h1dl","title":"4.2 Type I and Type II errors","slug":"type-type-ii-errors","type":"STUDY_GUIDE","date":null},{"id":"p51GnPotzpReA1YZ","title":"4.4 Statistical power","slug":"statistical-power","type":"STUDY_GUIDE","date":null},{"id":"OMTloyn2JzB4y9sj","title":"4.5 One-sample tests","slug":"one-sample-tests","type":"STUDY_GUIDE","date":null},{"id":"CCihz0QBeycja5xI","title":"4.6 Two-sample tests","slug":"two-sample-tests","type":"STUDY_GUIDE","date":null}]},{"id":"5HqBzzlN71HHuIbr","name":"Unit 5 – Confidence Intervals in Biostatistics","emoji":"📚","slug":"unit-5","hasResources":true,"resources":[{"id":"yzoYEpYaMr6OybfI","title":"5.3 Confidence interval for the difference between means","slug":"confidence-interval-difference-means","type":"STUDY_GUIDE","date":null},{"id":"cn0l264Li1Lhmo6l","title":"5.1 Confidence interval for the mean","slug":"confidence-interval","type":"STUDY_GUIDE","date":null},{"id":"L5iQST0TYsoVUt4Y","title":"5.2 Confidence interval for the proportion","slug":"confidence-interval-proportion","type":"STUDY_GUIDE","date":null},{"id":"t3SOYhz7VsHCNY7a","title":"5.4 Confidence interval for the difference between proportions","slug":"confidence-interval-difference-proportions","type":"STUDY_GUIDE","date":null},{"id":"beytRL0iOZSeEzPK","title":"5.5 Interpreting confidence intervals","slug":"interpreting-confidence-intervals","type":"STUDY_GUIDE","date":null}]},{"id":"lrOzUV5QUEYM8Oz8","name":"Unit 6 – Regression 
Analysis","emoji":"📚","slug":"unit-6","hasResources":true,"resources":[{"id":"9STvcq6HC9ZMpi1B","title":"6.3 Logistic regression","slug":"logistic-regression","type":"STUDY_GUIDE","date":null},{"id":"mkiUjWxLZuIXeFcb","title":"6.1 Simple linear regression","slug":"simple-linear-regression","type":"STUDY_GUIDE","date":null},{"id":"dalNtc1LwM1X1VfS","title":"6.2 Multiple linear regression","slug":"multiple-linear-regression","type":"STUDY_GUIDE","date":null},{"id":"ibO4uUwy3WwUuSjw","title":"6.4 Model diagnostics","slug":"model-diagnostics","type":"STUDY_GUIDE","date":null},{"id":"wFuxzd24hwhbAiXv","title":"6.5 Correlation analysis","slug":"correlation-analysis","type":"STUDY_GUIDE","date":null}]},{"id":"pmM6iTkJlTrbCJhp","name":"Unit 7 – Analysis of Variance (ANOVA)","emoji":"📚","slug":"unit-7","hasResources":true,"resources":[{"id":"YOlFtb7aiCZsGamU","title":"7.5 Assumptions and diagnostics","slug":"assumptions-diagnostics","type":"STUDY_GUIDE","date":null},{"id":"60h4kp5pU0UzVNz7","title":"7.2 Two-way ANOVA","slug":"two-way-anova","type":"STUDY_GUIDE","date":null},{"id":"B6Y0sj2UQAFyRXEV","title":"7.3 Repeated measures ANOVA","slug":"repeated-measures-anova","type":"STUDY_GUIDE","date":null},{"id":"igNe9jDwwC8mMu8r","title":"7.4 Post-hoc tests","slug":"post-hoc-tests","type":"STUDY_GUIDE","date":null},{"id":"RfAD16ZrOUhh1kqw","title":"7.1 One-way ANOVA","slug":"one-way-anova","type":"STUDY_GUIDE","date":null}]},{"id":"53U9MGSYxB66PLbg","name":"Unit 8 – Experimental Design","emoji":"📚","slug":"unit-8","hasResources":true,"resources":[{"id":"pFXBLuu48ndsErJT","title":"8.1 Randomization","slug":"randomization","type":"STUDY_GUIDE","date":null},{"id":"PtDgoky63gDBaOrp","title":"8.2 Blinding","slug":"blinding","type":"STUDY_GUIDE","date":null},{"id":"PtNWzA6JlwMX79DD","title":"8.5 Factorial designs","slug":"factorial-designs","type":"STUDY_GUIDE","date":null},{"id":"nq1omVcRclG3w7vG","title":"8.3 Control 
groups","slug":"control-groups","type":"STUDY_GUIDE","date":null},{"id":"IgNoPSwD4JdUPNzl","title":"8.4 Sample size determination","slug":"sample-size-determination","type":"STUDY_GUIDE","date":null}]},{"id":"kVVMucz579uvt2r7","name":"Unit 9 – Survival Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"e4D2tkFWN44Q8fNq","title":"9.1 Kaplan-Meier estimator","slug":"kaplan-meier-estimator","type":"STUDY_GUIDE","date":null},{"id":"HGGR4TjJqb7DiHQy","title":"9.3 Censoring","slug":"censoring","type":"STUDY_GUIDE","date":null},{"id":"tbsbuwbhiTDaFU6Z","title":"9.5 Hazard ratios","slug":"hazard-ratios","type":"STUDY_GUIDE","date":null},{"id":"EfuaY8jSNC9ZUNSr","title":"9.4 Log-rank test","slug":"log-rank-test","type":"STUDY_GUIDE","date":null},{"id":"IiPy3TSXkudZ28cm","title":"9.2 Cox proportional hazards model","slug":"cox-proportional-hazards-model","type":"STUDY_GUIDE","date":null}]},{"id":"FD86VDr2IzTFDYGz","name":"Unit 10 – Epidemiological Measures","emoji":"📚","slug":"unit-10","hasResources":true,"resources":[{"id":"lpPmnlnjuaEDOqrx","title":"10.2 Relative risk","slug":"relative-risk","type":"STUDY_GUIDE","date":null},{"id":"OaxQ7C81fpnqbwej","title":"10.4 Sensitivity and specificity","slug":"sensitivity-specificity","type":"STUDY_GUIDE","date":null},{"id":"K0eQFC2DjGgjdIbx","title":"10.1 Incidence and prevalence","slug":"incidence-prevalence","type":"STUDY_GUIDE","date":null},{"id":"4hpy9rn5TvyC9J8w","title":"10.3 Odds ratio","slug":"odds-ratio","type":"STUDY_GUIDE","date":null},{"id":"EsfutmLq0YnqH0xX","title":"10.5 Attributable risk","slug":"attributable-risk","type":"STUDY_GUIDE","date":null}]},{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"uuCg3qOSetbDAukb","title":"11.1 Introduction to statistical software 
packages","slug":"introduction-statistical-software-packages","type":"STUDY_GUIDE","date":null},{"id":"ECob8I1hDH2SslWt","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","type":"STUDY_GUIDE","date":null},{"id":"fltCwHwrmxFkPzLD","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","type":"STUDY_GUIDE","date":null},{"id":"UdImkGkjLzRBsQ7s","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","type":"STUDY_GUIDE","date":null},{"id":"X9OMg6UVm28mXVLC","title":"11.3 Data visualization tools","slug":"data-visualization-tools","type":"STUDY_GUIDE","date":null}]}],"activeUnit":{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"uuCg3qOSetbDAukb","title":"11.1 Introduction to statistical software packages","slug":"introduction-statistical-software-packages","type":"STUDY_GUIDE","date":null},{"id":"ECob8I1hDH2SslWt","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","type":"STUDY_GUIDE","date":null},{"id":"fltCwHwrmxFkPzLD","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","type":"STUDY_GUIDE","date":null},{"id":"UdImkGkjLzRBsQ7s","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","type":"STUDY_GUIDE","date":null},{"id":"X9OMg6UVm28mXVLC","title":"11.3 Data visualization tools","slug":"data-visualization-tools","type":"STUDY_GUIDE","date":null}]},"activeSubject":{"id":"introduction-to-biostatistics","name":"Intro to Biostatistics","emoji":"🫁","slug":"introduction-to-biostatistics","active":true,"keyTermsActive":null,"category":"Social Science","hasCalculators":false,"hasKeyTerms":true,"hasPracticeQuestions":false,"units":[{"id":"3JKbwuGd1WPCxkn4","name":"Unit 1 – Descriptive Statistics","emoji":"📚","slug":"unit-1","hasResources":true,"resources":[{"id":"1O4EJ8pgTtT3hwre","title":"1.2 
Measures of variability","slug":"measures-variability","type":"STUDY_GUIDE","date":null},{"id":"AShdABuaEPHvUkOX","title":"1.3 Data visualization techniques","slug":"data-visualization-techniques","type":"STUDY_GUIDE","date":null},{"id":"SJFSIp89wbjNlmXy","title":"1.4 Frequency distributions","slug":"frequency-distributions","type":"STUDY_GUIDE","date":null},{"id":"zS1nT2lH15ARMEao","title":"1.1 Measures of central tendency","slug":"measures-central-tendency","type":"STUDY_GUIDE","date":null},{"id":"7k8iyFWgRaojOQfV","title":"1.5 Percentiles and quartiles","slug":"percentiles-quartiles","type":"STUDY_GUIDE","date":null}]},{"id":"aJKTALy4JO4Ha2HP","name":"Unit 2 – Probability Theory","emoji":"📚","slug":"unit-2","hasResources":true,"resources":[{"id":"78qVzpxfnnqB0ffq","title":"2.5 Random variables","slug":"random-variables","type":"STUDY_GUIDE","date":null},{"id":"BU58gyLMfFXIuB3C","title":"2.1 Basic probability concepts","slug":"basic-probability-concepts","type":"STUDY_GUIDE","date":null},{"id":"5z4EKgr8lbFrtIP9","title":"2.2 Probability distributions","slug":"probability-distributions","type":"STUDY_GUIDE","date":null},{"id":"msSAV8mbsAWYevRE","title":"2.3 Conditional probability","slug":"conditional-probability","type":"STUDY_GUIDE","date":null},{"id":"bpMo0NUQ16NeHesD","title":"2.4 Bayes' theorem","slug":"bayes-theorem","type":"STUDY_GUIDE","date":null}]},{"id":"b1830ayB02ogOzzO","name":"Unit 3 – Sampling Distributions","emoji":"📚","slug":"unit-3","hasResources":true,"resources":[{"id":"Z9UiEYxDfIVxibiI","title":"3.1 Central Limit Theorem","slug":"central-limit-theorem","type":"STUDY_GUIDE","date":null},{"id":"DH7vkKyKkP1nOHyQ","title":"3.2 Standard error","slug":"standard-error","type":"STUDY_GUIDE","date":null},{"id":"6OREsMCeFCfhONQ3","title":"3.3 Sampling distribution of the mean","slug":"sampling-distribution","type":"STUDY_GUIDE","date":null},{"id":"1G7IUwO0pJEmakGw","title":"3.4 Sampling distribution of the 
proportion","slug":"sampling-distribution-proportion","type":"STUDY_GUIDE","date":null},{"id":"xtzdxxU3t8kDuZhX","title":"3.5 T-distribution","slug":"t-distribution","type":"STUDY_GUIDE","date":null}]},{"id":"mFxTb3hBEGL2TIbD","name":"Unit 4 – Hypothesis Testing","emoji":"📚","slug":"unit-4","hasResources":true,"resources":[{"id":"sCtn0fY8EqV54NyF","title":"4.3 P-values","slug":"p-values","type":"STUDY_GUIDE","date":null},{"id":"e8FiRZIVBSyn3ZsQ","title":"4.1 Null and alternative hypotheses","slug":"null-alternative-hypotheses","type":"STUDY_GUIDE","date":null},{"id":"4HFN174Cffq1h1dl","title":"4.2 Type I and Type II errors","slug":"type-type-ii-errors","type":"STUDY_GUIDE","date":null},{"id":"p51GnPotzpReA1YZ","title":"4.4 Statistical power","slug":"statistical-power","type":"STUDY_GUIDE","date":null},{"id":"OMTloyn2JzB4y9sj","title":"4.5 One-sample tests","slug":"one-sample-tests","type":"STUDY_GUIDE","date":null},{"id":"CCihz0QBeycja5xI","title":"4.6 Two-sample tests","slug":"two-sample-tests","type":"STUDY_GUIDE","date":null}]},{"id":"5HqBzzlN71HHuIbr","name":"Unit 5 – Confidence Intervals in Biostatistics","emoji":"📚","slug":"unit-5","hasResources":true,"resources":[{"id":"yzoYEpYaMr6OybfI","title":"5.3 Confidence interval for the difference between means","slug":"confidence-interval-difference-means","type":"STUDY_GUIDE","date":null},{"id":"cn0l264Li1Lhmo6l","title":"5.1 Confidence interval for the mean","slug":"confidence-interval","type":"STUDY_GUIDE","date":null},{"id":"L5iQST0TYsoVUt4Y","title":"5.2 Confidence interval for the proportion","slug":"confidence-interval-proportion","type":"STUDY_GUIDE","date":null},{"id":"t3SOYhz7VsHCNY7a","title":"5.4 Confidence interval for the difference between proportions","slug":"confidence-interval-difference-proportions","type":"STUDY_GUIDE","date":null},{"id":"beytRL0iOZSeEzPK","title":"5.5 Interpreting confidence 
intervals","slug":"interpreting-confidence-intervals","type":"STUDY_GUIDE","date":null}]},{"id":"lrOzUV5QUEYM8Oz8","name":"Unit 6 – Regression Analysis","emoji":"📚","slug":"unit-6","hasResources":true,"resources":[{"id":"9STvcq6HC9ZMpi1B","title":"6.3 Logistic regression","slug":"logistic-regression","type":"STUDY_GUIDE","date":null},{"id":"mkiUjWxLZuIXeFcb","title":"6.1 Simple linear regression","slug":"simple-linear-regression","type":"STUDY_GUIDE","date":null},{"id":"dalNtc1LwM1X1VfS","title":"6.2 Multiple linear regression","slug":"multiple-linear-regression","type":"STUDY_GUIDE","date":null},{"id":"ibO4uUwy3WwUuSjw","title":"6.4 Model diagnostics","slug":"model-diagnostics","type":"STUDY_GUIDE","date":null},{"id":"wFuxzd24hwhbAiXv","title":"6.5 Correlation analysis","slug":"correlation-analysis","type":"STUDY_GUIDE","date":null}]},{"id":"pmM6iTkJlTrbCJhp","name":"Unit 7 – Analysis of Variance (ANOVA)","emoji":"📚","slug":"unit-7","hasResources":true,"resources":[{"id":"YOlFtb7aiCZsGamU","title":"7.5 Assumptions and diagnostics","slug":"assumptions-diagnostics","type":"STUDY_GUIDE","date":null},{"id":"60h4kp5pU0UzVNz7","title":"7.2 Two-way ANOVA","slug":"two-way-anova","type":"STUDY_GUIDE","date":null},{"id":"B6Y0sj2UQAFyRXEV","title":"7.3 Repeated measures ANOVA","slug":"repeated-measures-anova","type":"STUDY_GUIDE","date":null},{"id":"igNe9jDwwC8mMu8r","title":"7.4 Post-hoc tests","slug":"post-hoc-tests","type":"STUDY_GUIDE","date":null},{"id":"RfAD16ZrOUhh1kqw","title":"7.1 One-way ANOVA","slug":"one-way-anova","type":"STUDY_GUIDE","date":null}]},{"id":"53U9MGSYxB66PLbg","name":"Unit 8 – Experimental Design","emoji":"📚","slug":"unit-8","hasResources":true,"resources":[{"id":"pFXBLuu48ndsErJT","title":"8.1 Randomization","slug":"randomization","type":"STUDY_GUIDE","date":null},{"id":"PtDgoky63gDBaOrp","title":"8.2 Blinding","slug":"blinding","type":"STUDY_GUIDE","date":null},{"id":"PtNWzA6JlwMX79DD","title":"8.5 Factorial 
designs","slug":"factorial-designs","type":"STUDY_GUIDE","date":null},{"id":"nq1omVcRclG3w7vG","title":"8.3 Control groups","slug":"control-groups","type":"STUDY_GUIDE","date":null},{"id":"IgNoPSwD4JdUPNzl","title":"8.4 Sample size determination","slug":"sample-size-determination","type":"STUDY_GUIDE","date":null}]},{"id":"kVVMucz579uvt2r7","name":"Unit 9 – Survival Analysis","emoji":"📚","slug":"unit-9","hasResources":true,"resources":[{"id":"e4D2tkFWN44Q8fNq","title":"9.1 Kaplan-Meier estimator","slug":"kaplan-meier-estimator","type":"STUDY_GUIDE","date":null},{"id":"HGGR4TjJqb7DiHQy","title":"9.3 Censoring","slug":"censoring","type":"STUDY_GUIDE","date":null},{"id":"tbsbuwbhiTDaFU6Z","title":"9.5 Hazard ratios","slug":"hazard-ratios","type":"STUDY_GUIDE","date":null},{"id":"EfuaY8jSNC9ZUNSr","title":"9.4 Log-rank test","slug":"log-rank-test","type":"STUDY_GUIDE","date":null},{"id":"IiPy3TSXkudZ28cm","title":"9.2 Cox proportional hazards model","slug":"cox-proportional-hazards-model","type":"STUDY_GUIDE","date":null}]},{"id":"FD86VDr2IzTFDYGz","name":"Unit 10 – Epidemiological Measures","emoji":"📚","slug":"unit-10","hasResources":true,"resources":[{"id":"lpPmnlnjuaEDOqrx","title":"10.2 Relative risk","slug":"relative-risk","type":"STUDY_GUIDE","date":null},{"id":"OaxQ7C81fpnqbwej","title":"10.4 Sensitivity and specificity","slug":"sensitivity-specificity","type":"STUDY_GUIDE","date":null},{"id":"K0eQFC2DjGgjdIbx","title":"10.1 Incidence and prevalence","slug":"incidence-prevalence","type":"STUDY_GUIDE","date":null},{"id":"4hpy9rn5TvyC9J8w","title":"10.3 Odds ratio","slug":"odds-ratio","type":"STUDY_GUIDE","date":null},{"id":"EsfutmLq0YnqH0xX","title":"10.5 Attributable risk","slug":"attributable-risk","type":"STUDY_GUIDE","date":null}]},{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","hasResources":true,"resources":[{"id":"uuCg3qOSetbDAukb","title":"11.1 Introduction to statistical software 
packages","slug":"introduction-statistical-software-packages","type":"STUDY_GUIDE","date":null},{"id":"ECob8I1hDH2SslWt","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","type":"STUDY_GUIDE","date":null},{"id":"fltCwHwrmxFkPzLD","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","type":"STUDY_GUIDE","date":null},{"id":"UdImkGkjLzRBsQ7s","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","type":"STUDY_GUIDE","date":null},{"id":"X9OMg6UVm28mXVLC","title":"11.3 Data visualization tools","slug":"data-visualization-tools","type":"STUDY_GUIDE","date":null}]}]}},"subjectBySlug":{"id":"introduction-to-biostatistics","name":"Intro to Biostatistics","branch":"Social Science","keyTermsActive":null,"subBranches":[{"name":"Public Health"}],"description":"## What do you learn in Introduction to Biostatistics\n\nBiostatistics blends biology, math, and statistics to analyze health-related data. You'll learn about study design, data collection methods, and statistical techniques used in public health research. The course covers probability theory, hypothesis testing, regression analysis, and interpreting medical literature. You'll also get hands-on experience with statistical software to crunch numbers and make sense of health trends.\n\n## Is Introduction to Biostatistics hard?\n\nIt can be challenging, especially if you're not a math whiz. The concepts aren't rocket science, but there's a lot to wrap your head around. Some students find the statistical formulas and software intimidating at first. But don't panic - with practice and a good study routine, most people get the hang of it. The key is to stay on top of the material and not fall behind.\n\n## Tips for taking Introduction to Biostatistics in college\n\n1. Use [Fiveable Study Guides](https://fiveable.me/cram-mode) to help you cram 🌶️\n2. Practice, practice, practice - work through lots of problem sets\n3. 
Form a study group to tackle tricky concepts together\n4. Don't just memorize formulas, understand the logic behind them\n5. Get comfortable with statistical software early on\n6. Relate the material to real-world health issues to make it more engaging\n7. Watch \"Moneyball\" to see how statistics can be applied in unexpected ways\n8. Read \"The Immortal Life of Henrietta Lacks\" for a fascinating look at ethics in medical research\n\n## Common pre-requisites for Introduction to Biostatistics\n\n1. Introductory Statistics: This course covers basic statistical concepts and methods. You'll learn about descriptive statistics, probability distributions, and hypothesis testing.\n\n2. Calculus I: This class introduces differential and integral calculus. It's essential for understanding more advanced statistical concepts and mathematical modeling in biostatistics.\n\n## Classes similar to Introduction to Biostatistics\n\n1. Epidemiology: This course focuses on the distribution and determinants of health-related events in populations. You'll learn about disease patterns, risk factors, and how to design and interpret epidemiological studies.\n\n2. Data Analysis for Public Health: This class teaches you how to wrangle, analyze, and visualize health data. You'll use statistical software to explore real-world public health datasets and draw meaningful conclusions.\n\n3. Research Methods in Public Health: This course covers the fundamentals of designing and conducting public health research. You'll learn about different study designs, sampling techniques, and how to critically evaluate scientific literature.\n\n## Majors related to Introduction to Biostatistics\n\n1. Public Health: Focuses on promoting health and preventing disease at the population level. Students learn about epidemiology, health policy, and environmental health.\n\n2. Biostatistics: Combines statistical theory with biological and health sciences. 
Students develop skills in data analysis, study design, and statistical modeling for medical research.\n\n3. Epidemiology: Concentrates on studying patterns, causes, and effects of health conditions in populations. Students learn to investigate disease outbreaks and assess public health interventions.\n\n4. Health Informatics: Merges healthcare, information technology, and data science. Students learn to manage and analyze health data to improve patient care and healthcare systems.\n\n## What can you do with a degree in Introduction to Biostatistics?\n\n1. Biostatistician: Designs studies and analyzes data for medical research. They collaborate with scientists and healthcare professionals to interpret results and draw conclusions.\n\n2. Clinical Data Analyst: Works with healthcare providers to analyze patient data and improve clinical outcomes. They use statistical methods to identify trends and patterns in medical records.\n\n3. Epidemiologist: Investigates patterns and causes of diseases in populations. They design and conduct studies to track public health issues and develop strategies for prevention and control.\n\n4. Health Data Scientist: Applies advanced analytics and machine learning to large healthcare datasets. They develop predictive models and data-driven solutions to improve patient care and health outcomes.\n\n## Introduction to Biostatistics FAQs\n\n1. Do I need to be a math genius to succeed in this course? Not at all, but you should be comfortable with basic algebra and willing to put in the effort to learn new concepts.\n\n2. Will we use any specific software in this class? Most likely, you'll use statistical software like R, SAS, or SPSS. Don't worry if you're new to these - you'll learn as you go.\n\n3. How is biostatistics different from regular statistics? Biostatistics applies statistical methods specifically to biological and health-related data. 
You'll focus on examples and applications in medicine and public health.","emoji":"🫁","order":null,"numResources":null,"active":true,"slug":"introduction-to-biostatistics","generationMetadata":{"group":"Group 9 – parent key terms first","level":"college undergraduate","branch":"Social Science","duration":"one semester","subBranch":null,"lengthVariant":"less text","model":"sonnet"}},"pageParams":{"communitySlug":"introduction-to-biostatistics","unitSlug":"unit-11"},"children":["$","$L1c",null,{"subject":{"name":"Intro to Biostatistics","emoji":"🫁","slug":"introduction-to-biostatistics","category":"Social Science","active":true,"keyTermsActive":null,"generationMetadata":{"group":"Group 9 – parent key terms first","level":"college undergraduate","branch":"Social Science","duration":"one semester","subBranch":null,"lengthVariant":"less text","model":"sonnet"},"id":"introduction-to-biostatistics","order":null,"numResources":null,"description":"## What do you learn in Introduction to Biostatistics\n\nBiostatistics blends biology, math, and statistics to analyze health-related data. You'll learn about study design, data collection methods, and statistical techniques used in public health research. The course covers probability theory, hypothesis testing, regression analysis, and interpreting medical literature. You'll also get hands-on experience with statistical software to crunch numbers and make sense of health trends.\n\n## Is Introduction to Biostatistics hard?\n\nIt can be challenging, especially if you're not a math whiz. The concepts aren't rocket science, but there's a lot to wrap your head around. Some students find the statistical formulas and software intimidating at first. But don't panic - with practice and a good study routine, most people get the hang of it. The key is to stay on top of the material and not fall behind.\n\n## Tips for taking Introduction to Biostatistics in college\n\n1. 
Use [Fiveable Study Guides](https://fiveable.me/cram-mode) to help you cram 🌶️\n2. Practice, practice, practice - work through lots of problem sets\n3. Form a study group to tackle tricky concepts together\n4. Don't just memorize formulas, understand the logic behind them\n5. Get comfortable with statistical software early on\n6. Relate the material to real-world health issues to make it more engaging\n7. Watch \"Moneyball\" to see how statistics can be applied in unexpected ways\n8. Read \"The Immortal Life of Henrietta Lacks\" for a fascinating look at ethics in medical research\n\n## Common pre-requisites for Introduction to Biostatistics\n\n1. Introductory Statistics: This course covers basic statistical concepts and methods. You'll learn about descriptive statistics, probability distributions, and hypothesis testing.\n\n2. Calculus I: This class introduces differential and integral calculus. It's essential for understanding more advanced statistical concepts and mathematical modeling in biostatistics.\n\n## Classes similar to Introduction to Biostatistics\n\n1. Epidemiology: This course focuses on the distribution and determinants of health-related events in populations. You'll learn about disease patterns, risk factors, and how to design and interpret epidemiological studies.\n\n2. Data Analysis for Public Health: This class teaches you how to wrangle, analyze, and visualize health data. You'll use statistical software to explore real-world public health datasets and draw meaningful conclusions.\n\n3. Research Methods in Public Health: This course covers the fundamentals of designing and conducting public health research. You'll learn about different study designs, sampling techniques, and how to critically evaluate scientific literature.\n\n## Majors related to Introduction to Biostatistics\n\n1. Public Health: Focuses on promoting health and preventing disease at the population level. 
Students learn about epidemiology, health policy, and environmental health.\n\n2. Biostatistics: Combines statistical theory with biological and health sciences. Students develop skills in data analysis, study design, and statistical modeling for medical research.\n\n3. Epidemiology: Concentrates on studying patterns, causes, and effects of health conditions in populations. Students learn to investigate disease outbreaks and assess public health interventions.\n\n4. Health Informatics: Merges healthcare, information technology, and data science. Students learn to manage and analyze health data to improve patient care and healthcare systems.\n\n## What can you do with a degree in Introduction to Biostatistics?\n\n1. Biostatistician: Designs studies and analyzes data for medical research. They collaborate with scientists and healthcare professionals to interpret results and draw conclusions.\n\n2. Clinical Data Analyst: Works with healthcare providers to analyze patient data and improve clinical outcomes. They use statistical methods to identify trends and patterns in medical records.\n\n3. Epidemiologist: Investigates patterns and causes of diseases in populations. They design and conduct studies to track public health issues and develop strategies for prevention and control.\n\n4. Health Data Scientist: Applies advanced analytics and machine learning to large healthcare datasets. They develop predictive models and data-driven solutions to improve patient care and health outcomes.\n\n## Introduction to Biostatistics FAQs\n\n1. Do I need to be a math genius to succeed in this course? Not at all, but you should be comfortable with basic algebra and willing to put in the effort to learn new concepts.\n\n2. Will we use any specific software in this class? Most likely, you'll use statistical software like R, SAS, or SPSS. Don't worry if you're new to these - you'll learn as you go.\n\n3. How is biostatistics different from regular statistics? 
Biostatistics applies statistical methods specifically to biological and health-related data. You'll focus on examples and applications in medicine and public health.","meta":{"title":"Intro to Biostatistics - Notes and Study Guides","description":"Study guides with what you need to know for your class on Intro to Biostatistics. Ace your next test."},"units":[{"id":"3JKbwuGd1WPCxkn4","name":"Unit 1 – Descriptive Statistics","emoji":"📚","slug":"unit-1","description":"Unit 1: Descriptive Statistics","intro":"Descriptive statistics form the foundation of data analysis in biomedical research. These methods organize, summarize, and present data, enabling researchers to extract meaningful insights from complex datasets. Understanding key concepts like population parameters, sample statistics, and data types is crucial for effective analysis.\n\nCentral tendency measures, variability metrics, and data visualization techniques are essential tools in the statistician's toolkit. These methods help researchers identify patterns, assess relationships between variables, and communicate findings effectively. 
Proper interpretation of descriptive statistics is vital for drawing accurate conclusions and avoiding common pitfalls in biomedical research.","overview":"## Key Concepts and Definitions\n- Descriptive statistics involves methods for organizing, summarizing, and presenting data in a meaningful way\n- Population refers to the entire group of individuals, objects, or events of interest while a sample is a subset of the population used for analysis\n- Parameters are numerical values that describe characteristics of a population (usually unknown) while statistics are numerical values calculated from sample data to estimate population parameters\n- Qualitative (categorical) data consists of non-numerical attributes or categories (gender, blood type) while quantitative (numerical) data represents measurements or counts (height, age)\n- Discrete quantitative data can only take on specific values, often integers (number of siblings) while continuous quantitative data can take on any value within a range (weight, temperature)\n- Univariate analysis examines one variable at a time while bivariate analysis explores relationships between two variables\n- Frequency distributions organize and summarize data by counting the occurrences of each value or category\n - Relative frequency is the proportion of observations in each category, calculated by dividing the frequency by the total number of observations\n\n## Types of Data and Variables\n- Nominal data consists of categories with no inherent order or ranking (race, religion)\n - Dichotomous (binary) variables have only two possible categories (alive/dead, yes/no)\n- Ordinal data has categories with a natural order or ranking, but differences between categories are not necessarily equal (socioeconomic status, pain severity)\n- Interval data has ordered categories with equal intervals between values, but no true zero point (temperature in Celsius or Fahrenheit)\n- Ratio data has ordered categories, equal intervals, and a true 
zero point representing the absence of the variable (height, weight, income)\n- Independent variables (predictors) are manipulated or controlled to observe their effect on dependent variables (outcomes)\n- Confounding variables are related to both the independent and dependent variables, potentially influencing the observed relationship between them (age, smoking status)\n- Effect modifiers (interaction terms) change the magnitude or direction of the relationship between an independent and dependent variable at different levels of the modifier (gender, genetic factors)\n\n## Measures of Central Tendency\n- Mean (arithmetic average) is the sum of all values divided by the number of observations, sensitive to extreme values (outliers)\n- Median is the middle value when data is ordered from lowest to highest, robust to outliers and suitable for skewed distributions\n - For an even number of observations, the median is the average of the two middle values\n- Mode is the most frequently occurring value in a dataset, useful for describing categorical or discrete data\n- Geometric mean is calculated by multiplying all values and taking the nth root (where n is the number of observations), used for positively skewed data or ratios\n- Harmonic mean is the reciprocal of the arithmetic mean of reciprocals, used for rates or ratios (average speed, drug clearance rates)\n- Weighted mean accounts for the relative importance of each value by assigning weights, used when some observations are more influential than others\n\n## Measures of Variability\n- Range is the difference between the maximum and minimum values, providing a simple measure of dispersion\n- Interquartile range (IQR) is the difference between the 75th and 25th percentiles (Q3 - Q1), a robust measure of dispersion less sensitive to outliers\n- Variance is the average squared deviation from the mean, quantifying how far observations are from the center\n - Sample variance (s²) has a denominator of n-1 to account 
for the loss of one degree of freedom when estimating the population variance\n- Standard deviation is the square root of the variance, expressing dispersion in the same units as the original data\n- Coefficient of variation (CV) is the ratio of the standard deviation to the mean, allowing comparison of variability across variables with different units or scales\n- Standard error of the mean (SEM) estimates the variability of the sample mean, calculated as the standard deviation divided by the square root of the sample size\n - Smaller SEM indicates more precise estimates of the population mean\n\n## Data Visualization Techniques\n- Histograms display the frequency distribution of continuous data using adjacent rectangles, with the area of each rectangle proportional to the frequency of observations in that bin\n- Bar charts compare frequencies or proportions of categorical data using separate rectangles, with the height of each bar representing the frequency or proportion\n- Pie charts show the relative frequencies of categorical data as slices of a circle, with the area of each slice proportional to the frequency or proportion\n- Box plots (box-and-whisker plots) summarize the distribution of continuous data using five summary statistics (minimum, Q1, median, Q3, maximum) and identify outliers\n- Scatter plots display the relationship between two continuous variables, with each point representing an observation and its coordinates corresponding to the values of the variables\n- Line graphs connect data points in order, often used to show trends or changes over time\n- Heat maps use color intensity to represent the magnitude of a variable across two dimensions (e.g., gene expression levels across samples and conditions)\n\n## Interpreting Descriptive Statistics\n- Assess the shape of the distribution (symmetric, skewed, bimodal) to select appropriate summary statistics and statistical tests\n - Skewed distributions have a long tail on one side and may require 
non-parametric methods or data transformations\n- Consider the presence of outliers, which can greatly influence the mean and standard deviation but have less impact on the median and IQR\n- Use measures of central tendency to describe the typical or representative value in a dataset\n - The mean is often used for normally distributed data, while the median is preferred for skewed distributions or when outliers are present\n- Employ measures of variability to quantify the spread or dispersion of the data, providing context for the central tendency measures\n - A small standard deviation indicates data points are clustered closely around the mean, while a large standard deviation suggests greater variability\n- Interpret the standard error of the mean as a measure of the precision of the sample mean estimate, with smaller values indicating more reliable estimates\n- Utilize data visualization techniques to identify patterns, trends, and relationships between variables, as well as to communicate findings effectively\n\n## Applications in Biomedical Research\n- Summarizing patient characteristics (age, BMI, blood pressure) and comparing across treatment groups in clinical trials\n- Describing the distribution of biomarkers (glucose levels, tumor size) and establishing reference ranges for diagnostic purposes\n- Analyzing epidemiological data (prevalence, incidence rates) to understand disease burden and risk factors in populations\n- Exploring relationships between variables (dose-response curves, genotype-phenotype associations) to generate hypotheses and guide further research\n- Monitoring quality control metrics (assay variability, batch effects) to ensure reliability and reproducibility of experimental results\n- Communicating research findings to diverse audiences (clinicians, policymakers, the public) using clear and informative data visualizations\n\n## Common Pitfalls and Misconceptions\n- Overinterpreting small differences in summary statistics without 
considering the variability and uncertainty in the data\n- Failing to recognize the limitations of summary statistics, such as the sensitivity of the mean to extreme values or the inability of the median to capture the full range of the data\n- Misusing or misinterpreting the standard error of the mean as a measure of the variability of individual observations rather than the precision of the sample mean estimate\n- Neglecting to assess the assumptions underlying certain statistical methods, such as the normality assumption for parametric tests\n- Confusing statistical significance with practical or clinical significance, as small differences may be statistically significant in large samples but not meaningful in practice\n- Overrelying on p-values and neglecting effect sizes and confidence intervals, which provide more informative measures of the magnitude and precision of the observed effects\n- Failing to account for multiple comparisons when conducting numerous hypothesis tests, increasing the likelihood of Type I errors (false positives)\n- Inappropriately extrapolating findings from a sample to a population without considering the representativeness of the sample and potential sources of bias","active":true,"order":1,"meta":{"title":"Descriptive Statistics | Intro to Biostatistics Class Notes","description":"Study guides to review Descriptive Statistics. 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"1O4EJ8pgTtT3hwre","type":"STUDY_GUIDE","title":"1.2 Measures of variability","slug":"measures-variability","date":null,"keyTopics":[],"publicId":"1O4EJ8pgTtT3hwre","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["7kZaAjDQvsyypQRk"],"duration":9},{"id":"AShdABuaEPHvUkOX","type":"STUDY_GUIDE","title":"1.3 Data visualization techniques","slug":"data-visualization-techniques","date":null,"keyTopics":[],"publicId":"AShdABuaEPHvUkOX","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["DIyYurZBmLjjwqIi"],"duration":15},{"id":"SJFSIp89wbjNlmXy","type":"STUDY_GUIDE","title":"1.4 Frequency distributions","slug":"frequency-distributions","date":null,"keyTopics":[],"publicId":"SJFSIp89wbjNlmXy","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["Vh3MbSOalPdDOrn6"],"duration":7},{"id":"zS1nT2lH15ARMEao","type":"STUDY_GUIDE","title":"1.1 Measures of central tendency","slug":"measures-central-tendency","date":null,"keyTopics":[],"publicId":"zS1nT2lH15ARMEao","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["O8pcRGrMhYUdWn5g"],"duration":8},{"id":"7k8iyFWgRaojOQfV","type":"STUDY_GUIDE","title":"1.5 Percentiles and quartiles","slug":"percentiles-quartiles","date":null,"keyTopics":[],"publicId":"7k8iyFWgRaojOQfV","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["9nFh9xJnqIZ5VNDi"],"duration":10}],"numResources":1},{"id":"aJKTALy4JO4Ha2HP","name":"Unit 2 – Probability 
Theory","emoji":"📚","slug":"unit-2","description":"Unit 2: Probability Theory","intro":"Probability theory forms the foundation of statistical analysis in biomedical research. It provides tools to quantify uncertainty, assess risks, and make informed decisions based on available data. Understanding key concepts like sample spaces, events, and random variables is crucial for interpreting study results.\n\nThis unit covers probability basics, types of probability, and probability distributions. It also explores applications in biostatistics, including diagnostic testing, epidemiology, and clinical trials. Mastering probability calculations and avoiding common pitfalls are essential skills for conducting rigorous statistical analyses in biomedical research.","overview":"## Key Concepts and Definitions\n- Probability the likelihood of an event occurring, expressed as a number between 0 and 1\n - 0 indicates an impossible event, while 1 represents a certain event\n- Sample space the set of all possible outcomes of an experiment or random process\n- Event a subset of the sample space, representing one or more outcomes of interest\n- Random variable a function that assigns a numerical value to each outcome in a sample space\n - Can be discrete (countable values) or continuous (uncountable values)\n- Independence two events are independent if the occurrence of one does not affect the probability of the other\n- Mutually exclusive events cannot occur simultaneously (rolling a 1 and a 6 on a single die roll)\n\n## Probability Basics\n- Probability is calculated by dividing the number of favorable outcomes by the total number of possible outcomes\n - P(A) = (number of favorable outcomes) / (total number of possible outcomes)\n- The sum of probabilities for all possible outcomes in a sample space equals 1\n- Complement of an event (A') probability that event A does not occur, calculated as P(A') = 1 - P(A)\n- Addition rule for mutually exclusive events P(A or B) = P(A) + 
P(B)\n- Multiplication rule for independent events P(A and B) = P(A) × P(B)\n- Conditional probability the probability of event A occurring given that event B has already occurred, denoted as P(A|B)\n - Calculated as P(A|B) = P(A and B) / P(B)\n\n## Types of Probability\n- Classical probability based on the assumption that all outcomes are equally likely\n - Used in situations with a finite number of equally likely outcomes (fair coin, unbiased die)\n- Empirical (frequentist) probability estimated based on observed data or past experiences\n - Calculated as the relative frequency of an event in a large number of trials\n- Subjective probability based on personal belief or judgment, often used when limited data is available\n- Axiomatic probability follows a set of axioms to ensure consistency and avoid paradoxes\n - Non-negativity P(A) ≥ 0 for all events A\n - Normalization P(S) = 1, where S is the entire sample space\n - Additivity for mutually exclusive events P(A or B) = P(A) + P(B)\n\n## Probability Distributions\n- Probability distribution a function that describes the likelihood of different outcomes for a random variable\n- Discrete probability distributions used for random variables with countable outcomes\n - Examples Bernoulli, binomial, Poisson, geometric distributions\n- Continuous probability distributions used for random variables with uncountable outcomes\n - Examples uniform, normal (Gaussian), exponential, beta distributions\n- Probability density function (PDF) describes the relative likelihood of a continuous random variable taking on a specific value\n- Cumulative distribution function (CDF) gives the probability that a random variable is less than or equal to a specific value\n- Expected value (mean) the average value of a random variable over a large number of trials\n- Variance and standard deviation measures of the spread or dispersion of a probability distribution\n\n## Applications in Biostatistics\n- Diagnostic testing calculating 
sensitivity, specificity, and predictive values using probability\n - Sensitivity P(positive test | disease), specificity P(negative test | no disease)\n- Epidemiology estimating disease prevalence, incidence, and risk factors using probability methods\n- Genetics calculating the probability of inheriting certain traits or genetic disorders based on Mendelian inheritance\n- Clinical trials determining the probability of treatment success, adverse events, and patient outcomes\n- Survival analysis estimating the probability of survival over time using methods like Kaplan-Meier curves and Cox regression\n- Risk assessment quantifying the probability of developing a disease or experiencing an adverse event based on risk factors\n\n## Probability Calculations\n- Bayes' theorem used to calculate the probability of an event based on prior knowledge and new evidence\n - P(A|B) = (P(B|A) × P(A)) / P(B)\n- Permutations calculate the number of ways to arrange objects in a specific order\n - nPr = n! / (n - r)!, where n is the total number of objects and r is the number of objects being arranged\n- Combinations calculate the number of ways to select objects without regard to order\n - nCr = n! / (r! 
× (n - r)!), where n is the total number of objects and r is the number of objects being selected\n- Binomial probability calculates the probability of a specific number of successes in a fixed number of independent trials\n - P(X = k) = nCk × p^k × (1 - p)^(n - k), where n is the number of trials, k is the number of successes, and p is the probability of success in a single trial\n- Poisson probability calculates the probability of a specific number of events occurring in a fixed interval of time or space\n - P(X = k) = (λ^k × e^(-λ)) / k!, where λ is the average number of events per interval and k is the number of events of interest\n\n## Common Mistakes and Pitfalls\n- Confusing independence and mutual exclusivity events can be mutually exclusive but not independent, or independent but not mutually exclusive\n- Misinterpreting conditional probability P(A|B) is not always equal to P(B|A)\n- Neglecting the base rate (prior probability) when using Bayes' theorem\n- Misusing the multiplication rule for non-independent events P(A and B) ≠ P(A) × P(B) if A and B are dependent\n- Overestimating the likelihood of rare events based on personal experience or media coverage (availability heuristic)\n- Misinterpreting p-values as the probability of the null hypothesis being true, rather than the probability of observing the data given that the null hypothesis is true\n\n## Real-World Examples\n- Weather forecasting predicting the probability of rain, snow, or other weather events based on historical data and current conditions\n- Insurance calculating premiums based on the probability of claims, considering factors like age, health status, and risk behaviors\n- Quality control estimating the probability of defective products in a manufacturing process to ensure compliance with standards\n- Sports betting determining the odds of different outcomes in a game or tournament based on team statistics and performance\n- Medical decision-making using probability to weigh the risks 
and benefits of different diagnostic tests or treatment options\n- Finance assessing the probability of investment returns, loan defaults, or market fluctuations to inform financial strategies","active":true,"order":2,"meta":{"title":"Probability Theory | Intro to Biostatistics Class Notes","description":"Study guides to review Probability Theory. For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"78qVzpxfnnqB0ffq","type":"STUDY_GUIDE","title":"2.5 Random variables","slug":"random-variables","date":null,"keyTopics":[],"publicId":"78qVzpxfnnqB0ffq","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["8IdetjGewt28tdX4"],"duration":8},{"id":"BU58gyLMfFXIuB3C","type":"STUDY_GUIDE","title":"2.1 Basic probability concepts","slug":"basic-probability-concepts","date":null,"keyTopics":[],"publicId":"BU58gyLMfFXIuB3C","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["3XSRlDHQaGL1Hr0d"],"duration":8},{"id":"5z4EKgr8lbFrtIP9","type":"STUDY_GUIDE","title":"2.2 Probability distributions","slug":"probability-distributions","date":null,"keyTopics":[],"publicId":"5z4EKgr8lbFrtIP9","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["WCAshnpow8GLowHN"],"duration":10},{"id":"msSAV8mbsAWYevRE","type":"STUDY_GUIDE","title":"2.3 Conditional probability","slug":"conditional-probability","date":null,"keyTopics":[],"publicId":"msSAV8mbsAWYevRE","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["fSqoflSZC8GWxS3C"],"duration":6},{"id":"bpMo0NUQ16NeHesD","type":"STUDY_GUIDE","title":"2.4 Bayes' 
theorem","slug":"bayes-theorem","date":null,"keyTopics":[],"publicId":"bpMo0NUQ16NeHesD","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["d4MY8YTdhKFnrVF6"],"duration":9}],"numResources":1},{"id":"b1830ayB02ogOzzO","name":"Unit 3 – Sampling Distributions","emoji":"📚","slug":"unit-3","description":"Unit 3: Sampling Distributions","intro":"Sampling distributions are crucial in biostatistics, allowing us to make inferences about populations based on sample data. They represent how sample statistics like means or proportions vary across multiple samples, providing insights into population parameters.\n\nThe Central Limit Theorem is key, stating that for large samples, the distribution of sample means approaches normality. This enables the use of standard errors and confidence intervals to estimate population parameters and conduct hypothesis tests in biomedical research.","overview":"## Key Concepts\n- Sampling distribution represents the distribution of a sample statistic over many samples drawn from a population\n- Sample statistic can be the mean, median, proportion, or other descriptive measures of a sample\n- Sampling distributions allow inferring properties of the population based on sample data\n- Shape, center, and spread are key characteristics of sampling distributions\n- Central Limit Theorem states that the sampling distribution of the mean approaches a normal distribution as the sample size increases, regardless of the shape of the population distribution\n- Standard error measures the variability of a sample statistic and decreases as the sample size increases\n- Confidence intervals provide a range of plausible values for a population parameter based on sample data and a specified level of confidence\n- Sampling distributions are the foundation for hypothesis testing and estimation in biostatistics\n\n## Types of Sampling\n- Simple random sampling 
ensures each member of the population has an equal chance of being selected\n - Reduces bias and allows for generalization to the population\n- Stratified sampling divides the population into subgroups (strata) and samples from each stratum independently\n - Ensures representation of important subgroups and improves precision\n- Cluster sampling involves dividing the population into clusters and randomly selecting entire clusters to sample\n - Useful when a complete list of individuals in the population is not available or when clusters are naturally occurring (hospitals, schools)\n- Systematic sampling selects every kth element from a list of the population\n - Easy to implement but may introduce bias if there is a pattern in the list\n- Convenience sampling selects readily available individuals, often leading to bias and limited generalizability\n- Purposive sampling selects individuals based on specific characteristics or criteria determined by the researcher\n - Useful for studying specific subgroups but limits generalizability\n\n## Sampling Distribution Characteristics\n- Shape of the sampling distribution depends on the sample size and the population distribution\n - For large sample sizes, the sampling distribution of the mean is approximately normal, regardless of the population distribution (Central Limit Theorem)\n- Center of the sampling distribution is equal to the population parameter being estimated\n - For the sampling distribution of the mean, the center is equal to the population mean\n- Spread of the sampling distribution is measured by the standard error\n - Standard error of the mean is equal to the population standard deviation divided by the square root of the sample size\n- Increasing the sample size reduces the standard error and narrows the sampling distribution\n- Skewness and kurtosis of the sampling distribution approach zero as the sample size increases\n- Sampling distributions are theoretical distributions based on repeated sampling 
from a population\n- Sampling distributions are used to calculate probabilities and make inferences about population parameters\n\n## Central Limit Theorem\n- States that the sampling distribution of the mean approaches a normal distribution as the sample size increases, regardless of the shape of the population distribution\n- Requires a sufficiently large sample size, typically n ≥ 30, for the theorem to hold\n- Allows for the use of normal distribution-based methods for inference, even when the population distribution is not normal\n- The mean of the sampling distribution is equal to the population mean, $\\mu$\n- The standard deviation of the sampling distribution (standard error) is equal to the population standard deviation, $\\sigma$, divided by the square root of the sample size, $\\sqrt{n}$: $\\frac{\\sigma}{\\sqrt{n}}$\n- The Central Limit Theorem is a key concept in inferential statistics and enables the use of parametric tests and confidence intervals\n\n## Standard Error\n- Measures the variability of a sample statistic, such as the mean or proportion\n- For the sample mean, the standard error is calculated as: $\\frac{s}{\\sqrt{n}}$, where $s$ is the sample standard deviation and $n$ is the sample size\n- Smaller standard errors indicate more precise estimates of the population parameter\n- Standard error decreases as the sample size increases, following the inverse square root relationship: $SE \\propto \\frac{1}{\\sqrt{n}}$\n- Used to construct confidence intervals and test hypotheses about population parameters\n- Differs from the standard deviation, which measures the variability of individual observations within a sample\n- Helps determine the minimum sample size needed to achieve a desired level of precision in estimating a population parameter\n\n## Confidence Intervals\n- Provide a range of plausible values for a population parameter based on sample data and a specified level of confidence\n- Confidence level (e.g., 95%) represents the 
proportion of intervals that would contain the true population parameter if the sampling process were repeated many times\n- For a sample mean, the confidence interval is calculated as: $\\bar{x} \\pm z_{\\alpha/2} \\cdot \\frac{s}{\\sqrt{n}}$, where $\\bar{x}$ is the sample mean, $z_{\\alpha/2}$ is the critical value from the standard normal distribution corresponding to the confidence level, $s$ is the sample standard deviation, and $n$ is the sample size\n- Wider confidence intervals indicate less precision in the estimate, while narrower intervals suggest greater precision\n- Increasing the sample size or decreasing the confidence level will result in narrower confidence intervals\n- Confidence intervals are an essential tool for estimating population parameters and reporting the uncertainty associated with the estimates\n- Interpreting confidence intervals correctly is crucial for making valid inferences about the population\n\n## Applications in Biostatistics\n- Sampling distributions are used to estimate population parameters, such as the mean blood pressure or the proportion of individuals with a specific disease\n- Confidence intervals are used to report the precision of estimates in biomedical research studies\n - Example: The 95% confidence interval for the mean systolic blood pressure in a sample of hypertensive patients is (135 mmHg, 145 mmHg)\n- Hypothesis testing relies on sampling distributions to determine the probability of observing a sample statistic under the null hypothesis\n - Example: Comparing the mean BMI between two groups (treatment and control) to assess the effectiveness of a weight loss intervention\n- Sample size calculations use the standard error and desired precision to determine the minimum number of participants needed in a study\n- Sampling distributions are crucial for interpreting the results of biomedical research and making evidence-based decisions in healthcare\n- Understanding sampling distributions is essential for 
critically evaluating the validity and generalizability of research findings in biostatistics\n\n## Common Pitfalls and Misconceptions\n- Confusing the standard deviation and standard error\n - Standard deviation measures the variability of individual observations, while standard error measures the variability of a sample statistic\n- Misinterpreting confidence intervals as the probability that the true population parameter lies within the interval\n - Confidence level refers to the proportion of intervals that would contain the true parameter if the sampling process were repeated many times\n- Assuming that a larger sample size always leads to more accurate estimates\n - While larger sample sizes generally improve precision, they do not correct for bias if the sampling method is not appropriate\n- Failing to check the assumptions of the Central Limit Theorem before applying normal distribution-based methods\n - The theorem requires a sufficiently large sample size and independence of observations\n- Overgeneralizing findings from a sample to the entire population without considering the representativeness of the sample\n - Non-random sampling methods may introduce bias and limit generalizability\n- Misinterpreting p-values as the probability that the null hypothesis is true or that the results are due to chance alone\n - P-values represent the probability of observing a sample statistic at least as extreme as the one observed, assuming the null hypothesis is true\n- Focusing solely on statistical significance without considering the practical or clinical significance of the results\n - Small differences may be statistically significant with large sample sizes but have limited real-world impact","active":true,"order":3,"meta":{"title":"Sampling Distributions | Intro to Biostatistics Class Notes","description":"Study guides to review Sampling Distributions. 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"Z9UiEYxDfIVxibiI","type":"STUDY_GUIDE","title":"3.1 Central Limit Theorem","slug":"central-limit-theorem","date":null,"keyTopics":[],"publicId":"Z9UiEYxDfIVxibiI","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["9R8Pq3H2jihHQ9iv"],"duration":7},{"id":"DH7vkKyKkP1nOHyQ","type":"STUDY_GUIDE","title":"3.2 Standard error","slug":"standard-error","date":null,"keyTopics":[],"publicId":"DH7vkKyKkP1nOHyQ","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["GrzTtq8GfY6zUIee"],"duration":6},{"id":"6OREsMCeFCfhONQ3","type":"STUDY_GUIDE","title":"3.3 Sampling distribution of the mean","slug":"sampling-distribution","date":null,"keyTopics":[],"publicId":"6OREsMCeFCfhONQ3","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["4t4Fhfo2MWur3Uh1"],"duration":5},{"id":"1G7IUwO0pJEmakGw","type":"STUDY_GUIDE","title":"3.4 Sampling distribution of the proportion","slug":"sampling-distribution-proportion","date":null,"keyTopics":[],"publicId":"1G7IUwO0pJEmakGw","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["ZRGMcoiXV7rgyaDO"],"duration":7},{"id":"xtzdxxU3t8kDuZhX","type":"STUDY_GUIDE","title":"3.5 T-distribution","slug":"t-distribution","date":null,"keyTopics":[],"publicId":"xtzdxxU3t8kDuZhX","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["axpWr5ImkEjCJbPQ"],"duration":8}],"numResources":1},{"id":"mFxTb3hBEGL2TIbD","name":"Unit 4 – Hypothesis 
Testing","emoji":"📚","slug":"unit-4","description":"Unit 4: Hypothesis Testing","intro":"Hypothesis testing is a crucial tool in biostatistics for drawing conclusions from data. It involves formulating null and alternative hypotheses, choosing appropriate tests, and interpreting results. This process helps researchers assess the significance of observed effects and make informed decisions based on statistical evidence.\n\nKey concepts include null and alternative hypotheses, types of errors, test statistics, and p-values. Various tests are used for different scenarios, such as comparing means or proportions. Proper interpretation of results, considering statistical and practical significance, is essential for drawing meaningful conclusions in biomedical research.","overview":"## Key Concepts\n- Null hypothesis ($H_0$) represents the default or status quo, assuming no significant effect or difference\n- Alternative hypothesis ($H_A$ or $H_1$) challenges the null hypothesis, proposing a significant effect or difference exists\n- Type I error (false positive) occurs when rejecting a true null hypothesis, with the significance level $\\alpha$ controlling its probability\n- Type II error (false negative) happens when failing to reject a false null hypothesis, with $\\beta$ denoting its probability\n - Power ($1-\\beta$) measures the probability of correctly rejecting a false null hypothesis\n- Test statistic is a value calculated from the sample data, used to determine whether to reject the null hypothesis\n- Critical value is a threshold on the test statistic that determines the rejection region for the null hypothesis\n- Rejection region is the range of test statistic values that lead to rejecting the null hypothesis\n\n## Types of Hypothesis Tests\n- One-sample tests compare a single sample mean or proportion to a hypothesized population value\n - One-sample t-test for comparing a sample mean to a population mean with unknown variance\n - One-sample z-test for 
comparing a sample mean to a population mean with known variance\n - One-sample proportion test for comparing a sample proportion to a population proportion\n- Two-sample tests compare means or proportions between two independent groups\n - Two-sample t-test for comparing means between two groups with unknown variances\n - Two-sample z-test for comparing means between two groups with known variances\n - Two-sample proportion test for comparing proportions between two groups\n- Paired tests compare means or proportions between two related or matched groups\n - Paired t-test for comparing means between two related groups\n - McNemar's test for comparing proportions between two related groups\n- Analysis of Variance (ANOVA) tests compare means among three or more groups\n - One-way ANOVA for comparing means among groups with one factor\n - Two-way ANOVA for comparing means among groups with two factors\n- Chi-square tests assess the association between two categorical variables\n - Chi-square test of independence for testing the association between two categorical variables\n - Chi-square goodness-of-fit test for comparing observed frequencies to expected frequencies\n\n## Steps in Hypothesis Testing\n1. State the null and alternative hypotheses clearly, specifying the parameter of interest and the direction of the alternative hypothesis (one-tailed or two-tailed)\n2. Choose an appropriate test statistic and significance level ($\\alpha$) based on the type of data and the research question\n3. Calculate the test statistic from the sample data using the appropriate formula for the selected hypothesis test\n4. Determine the critical value(s) or p-value associated with the test statistic, using the sampling distribution of the test statistic under the null hypothesis\n5. Compare the test statistic to the critical value(s), or compare the p-value to the significance level, and decide whether to reject or fail to reject the null hypothesis\n6. 
Interpret the results in the context of the research question, considering the practical significance and potential limitations of the study\n7. Report the findings, including the test statistic, p-value, confidence interval (if applicable), and a clear conclusion based on the hypothesis test\n\n## Statistical Significance and p-values\n- Statistical significance indicates the likelihood of observing the sample results or more extreme results, assuming the null hypothesis is true\n- p-value is the probability of obtaining the observed sample results or more extreme results, given that the null hypothesis is true\n - A small p-value (typically < 0.05) suggests strong evidence against the null hypothesis, leading to its rejection\n - A large p-value (typically > 0.05) indicates weak evidence against the null hypothesis, leading to a failure to reject it\n- Significance level ($\\alpha$) is the predetermined probability threshold for rejecting the null hypothesis, commonly set at 0.05\n- Confidence interval provides a range of plausible values for the population parameter, with a specified level of confidence (e.g., 95%)\n - If the confidence interval does not contain the null hypothesis value, it suggests a statistically significant result\n- Multiple testing correction adjusts the significance level when conducting multiple hypothesis tests simultaneously to control the familywise error rate or false discovery rate\n\n## Common Errors and Pitfalls\n- Misinterpretation of p-values as the probability of the null hypothesis being true or the probability of the results occurring by chance alone\n- Confusing statistical significance with practical or clinical significance, as large sample sizes can lead to statistically significant but practically unimportant differences\n- Failing to check assumptions of the hypothesis test, such as normality, homogeneity of variance, or independence of observations\n- Choosing an inappropriate hypothesis test for the data type or 
research question, leading to invalid conclusions\n- Overinterpreting non-significant results as evidence of no effect, as a lack of statistical significance may be due to insufficient power or sample size\n- Engaging in data dredging or p-hacking, where multiple analyses are conducted until a significant result is found, without proper adjustment for multiple testing\n- Neglecting to consider potential confounding variables or alternative explanations for the observed results\n\n## Real-world Applications in Biostatistics\n- Clinical trials comparing the efficacy of a new drug to a placebo or standard treatment using hypothesis tests to assess treatment differences\n- Epidemiological studies investigating the association between risk factors and disease outcomes using hypothesis tests to identify significant relationships\n- Genetic studies testing for associations between genetic variants and phenotypic traits using hypothesis tests to detect significant genetic effects\n- Public health research evaluating the effectiveness of interventions or policies using hypothesis tests to compare outcomes between groups\n- Diagnostic test validation assessing the performance of a new diagnostic test compared to a gold standard using hypothesis tests to evaluate sensitivity and specificity\n- Survival analysis comparing survival rates between different treatment groups using hypothesis tests to detect significant differences in survival curves\n- Meta-analyses combining results from multiple studies using hypothesis tests to assess the overall effect size and heterogeneity across studies\n\n## Interpreting and Reporting Results\n- Report the null and alternative hypotheses, the chosen hypothesis test, and the significance level\n- Present the test statistic, degrees of freedom (if applicable), and the corresponding p-value\n- Interpret the p-value in the context of the research question, stating whether the null hypothesis is rejected or not rejected based on the 
significance level\n- Provide a confidence interval for the parameter of interest, if applicable, to indicate the precision of the estimate\n- Discuss the practical or clinical significance of the findings, considering the magnitude of the effect and its relevance to the field\n- Address potential limitations of the study, such as sample size, generalizability, or potential confounding factors\n- Suggest future research directions based on the findings and any unanswered questions or new hypotheses generated by the study\n\n## Advanced Topics and Extensions\n- Non-parametric tests, such as the Wilcoxon rank-sum test or Kruskal-Wallis test, for data that violate assumptions of parametric tests\n- Bayesian hypothesis testing, which incorporates prior information and calculates the posterior probability of the null and alternative hypotheses\n- Equivalence and non-inferiority testing, which aim to demonstrate that two treatments are similar or that a new treatment is not worse than a standard treatment by a specified margin\n- Multiple comparison procedures, such as Bonferroni correction or Tukey's HSD, to control the familywise error rate when conducting multiple pairwise comparisons\n- Multivariate hypothesis tests, such as MANOVA or Hotelling's T-squared test, for comparing means across multiple dependent variables simultaneously\n- Mixed-effects models and repeated measures designs, which account for correlated data structures and random effects in hypothesis testing\n- Sequential analysis and adaptive designs, which allow for interim analyses and modifications to the study design based on accumulating data while controlling the Type I error rate\n- Power analysis and sample size determination, which help ensure that a study has sufficient power to detect a meaningful effect size given the desired significance level and variability in the data","active":true,"order":4,"meta":{"title":"Hypothesis Testing | Intro to Biostatistics Class Notes","description":"Study 
guides to review Hypothesis Testing. For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"sCtn0fY8EqV54NyF","type":"STUDY_GUIDE","title":"4.3 P-values","slug":"p-values","date":null,"keyTopics":[],"publicId":"sCtn0fY8EqV54NyF","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["uWXJdBN28shclHIV"],"duration":8},{"id":"e8FiRZIVBSyn3ZsQ","type":"STUDY_GUIDE","title":"4.1 Null and alternative hypotheses","slug":"null-alternative-hypotheses","date":null,"keyTopics":[],"publicId":"e8FiRZIVBSyn3ZsQ","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["MvDLIcQN3mh7Ud4Q"],"duration":9},{"id":"4HFN174Cffq1h1dl","type":"STUDY_GUIDE","title":"4.2 Type I and Type II errors","slug":"type-type-ii-errors","date":null,"keyTopics":[],"publicId":"4HFN174Cffq1h1dl","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["P1U85s6YphZ13ETH"],"duration":7},{"id":"p51GnPotzpReA1YZ","type":"STUDY_GUIDE","title":"4.4 Statistical power","slug":"statistical-power","date":null,"keyTopics":[],"publicId":"p51GnPotzpReA1YZ","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["HNl4Akm74aYCoQvr"],"duration":7},{"id":"OMTloyn2JzB4y9sj","type":"STUDY_GUIDE","title":"4.5 One-sample tests","slug":"one-sample-tests","date":null,"keyTopics":[],"publicId":"OMTloyn2JzB4y9sj","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["w0xDHrIyEzAn1fa8"],"duration":10},{"id":"CCihz0QBeycja5xI","type":"STUDY_GUIDE","title":"4.6 Two-sample 
tests","slug":"two-sample-tests","date":null,"keyTopics":[],"publicId":"CCihz0QBeycja5xI","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["YRlXP1A3KAyTVAKa"],"duration":8}],"numResources":1},{"id":"5HqBzzlN71HHuIbr","name":"Unit 5 – Confidence Intervals in Biostatistics","emoji":"📚","slug":"unit-5","description":"Unit 5: Confidence Intervals","intro":"Confidence intervals are a crucial tool in biostatistics, providing a range of plausible values for population parameters based on sample data. They help quantify uncertainty in estimates, allowing researchers to draw meaningful conclusions from studies and compare different groups or treatments.\n\nUnderstanding confidence intervals is essential for interpreting research findings, designing studies, and conducting meta-analyses. This topic covers key concepts, calculation methods, interpretation guidelines, and applications in biomedical research, as well as common pitfalls and advanced extensions of the technique.","overview":"## Key Concepts and Definitions\n- Confidence intervals provide a range of plausible values for an unknown population parameter based on sample data\n- Confidence level represents the proportion of intervals that would contain the true population parameter if the sampling process were repeated many times\n- Standard error measures the variability of a statistic, such as the sample mean, across different samples\n- Margin of error determines the width of the confidence interval and is calculated using the standard error and a critical value from the appropriate distribution (e.g., t-distribution or z-distribution)\n- Point estimate is a single value, such as the sample mean, used to estimate the population parameter\n- Sampling distribution describes the distribution of a statistic across many samples from the same population\n- Central Limit Theorem states that the sampling distribution 
of the mean approaches a normal distribution as the sample size increases, regardless of the shape of the population distribution\n\n## Importance in Biostatistics\n- Confidence intervals help quantify the uncertainty associated with point estimates, providing a more informative summary of the data\n- They allow researchers to draw conclusions about population parameters based on sample data, which is crucial in biomedical research where studying entire populations is often impractical\n- Confidence intervals can be used to compare different groups or treatments, helping to determine if observed differences are statistically significant\n- They provide a way to assess the precision of estimates, with narrower intervals indicating more precise estimates\n- Confidence intervals are essential for sample size calculations and power analysis in study design\n- They facilitate meta-analysis by allowing researchers to combine results from multiple studies and estimate the overall effect size\n- Confidence intervals are widely reported in biomedical literature, making their understanding crucial for interpreting research findings\n\n## Types of Confidence Intervals\n- Two-sided confidence intervals provide both a lower and an upper bound for the true population parameter, with the chance of missing the parameter split equally above and below the interval\n- One-sided confidence intervals provide only a lower bound or only an upper bound for the true population parameter, depending on the direction of interest\n- Confidence intervals for means are used when the population parameter of interest is the mean of a continuous variable\n - They can be calculated using the t-distribution when the population standard deviation is unknown and the sample size is small (typically < 30)\n - They can be calculated using the z-distribution when the population standard deviation is known or the sample size is large (typically ≥ 
30)\n- Confidence intervals for proportions are used when the population parameter of interest is a proportion or percentage\n- Confidence intervals for differences between means or proportions are used to compare two groups or treatments\n- Confidence intervals for ratios, such as relative risks or odds ratios, are used to assess the strength of association between two variables\n\n## Calculating Confidence Intervals\n- The general formula for a confidence interval is: point estimate ± (critical value × standard error)\n- For means, the point estimate is the sample mean ($\\bar{x}$), and the standard error is calculated as $s/\\sqrt{n}$, where $s$ is the sample standard deviation and $n$ is the sample size\n- The critical value depends on the desired confidence level and the appropriate distribution (e.g., t-distribution or z-distribution)\n - For a 95% confidence interval using the t-distribution, the critical value is denoted as $t_{\\alpha/2, n-1}$, where $\\alpha = 1 - \\text{confidence level}$ and $n-1$ is the degrees of freedom\n - For a 95% confidence interval using the z-distribution, the critical value is approximately 1.96\n- For proportions, the point estimate is the sample proportion ($\\hat{p}$), and the standard error is calculated as $\\sqrt{\\hat{p}(1-\\hat{p})/n}$\n- When calculating confidence intervals for differences or ratios, the standard error formula must be adjusted to account for the variability in both groups or variables\n\n## Interpreting Confidence Intervals\n- A confidence interval that does not contain the null value (e.g., 0 for differences, 1 for ratios) suggests a statistically significant result at the corresponding confidence level\n- Wider confidence intervals indicate less precise estimates and more uncertainty, while narrower intervals indicate more precise estimates and less uncertainty\n- Confidence intervals provide information about the magnitude and direction of an effect, not just its statistical significance\n- When 
comparing two confidence intervals, if they do not overlap, it suggests a statistically significant difference between the groups or treatments\n- Overlapping confidence intervals do not necessarily imply a lack of statistical significance, as the degree of overlap and the significance level must be considered\n- Confidence intervals should be interpreted in the context of the research question, study design, and other relevant factors, such as clinical significance and practical implications\n\n## Applications in Biomedical Research\n- Confidence intervals are commonly reported for measures of central tendency (e.g., means) and variability (e.g., standard deviations) to summarize continuous variables\n- They are used to estimate the prevalence or incidence of diseases or conditions in a population based on sample data\n- Confidence intervals are employed to assess the effectiveness of interventions, such as drugs or therapies, by comparing outcomes between treatment and control groups\n- They are used to evaluate the accuracy of diagnostic tests by estimating sensitivity, specificity, and predictive values\n- Confidence intervals are reported for measures of association, such as relative risks, odds ratios, and correlation coefficients, to assess the strength and direction of relationships between variables\n- They are used in meta-analyses to combine results from multiple studies and estimate the overall effect size, taking into account the variability across studies\n- Confidence intervals are considered in sample size calculations and power analysis to ensure that studies are adequately powered to detect meaningful differences or associations\n\n## Common Mistakes and Pitfalls\n- Misinterpreting a confidence interval as a range that contains 95% (or another confidence level) of the data, rather than a range produced by a procedure that would capture the true population parameter in 95% of repeated samples\n- Failing to consider the width of the confidence interval when interpreting the 
precision of the estimate\n- Assuming that overlapping confidence intervals always indicate the absence of a statistically significant difference, without considering the significance level and the extent of the overlap\n- Interpreting a confidence interval that includes the null value as evidence of no effect, rather than as insufficient evidence to reject the null hypothesis\n- Comparing confidence intervals across studies with different sample sizes, variability, or methods without considering these factors\n- Focusing solely on statistical significance based on confidence intervals, while neglecting the practical or clinical significance of the results\n- Failing to report the confidence level and the methods used to calculate the confidence intervals in research papers, which limits the interpretability and reproducibility of the findings\n\n## Advanced Topics and Extensions\n- Confidence intervals for medians and other percentiles can be calculated using non-parametric methods, such as the binomial method or the bootstrap method\n- Simultaneous confidence intervals are used when making multiple comparisons, such as in analysis of variance (ANOVA) or multiple regression, to control the overall Type I error rate\n- Confidence bands are used to provide a visual representation of the uncertainty around an estimated curve, such as a regression line or a survival curve\n- Bayesian credible intervals are an alternative to frequentist confidence intervals and incorporate prior information about the parameter of interest\n- Confidence intervals can be constructed for complex sampling designs, such as stratified or clustered sampling, using appropriate variance estimation methods\n- Confidence intervals for functions of parameters, such as ratios or products of means or proportions, can be calculated using the delta method or the bootstrap method\n- Confidence intervals for dependent or correlated data, such as in repeated measures designs or clustered data, require specialized 
methods that account for the correlation structure","active":true,"order":5,"meta":{"title":"Confidence Intervals in Biostatistics | Intro to Biostatistics Class Notes","description":"Study guides to review Confidence Intervals in Biostatistics. For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"yzoYEpYaMr6OybfI","type":"STUDY_GUIDE","title":"5.3 Confidence interval for the difference between means","slug":"confidence-interval-difference-means","date":null,"keyTopics":[],"publicId":"yzoYEpYaMr6OybfI","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["5QOChUBA8te8X1gJ"],"duration":7},{"id":"cn0l264Li1Lhmo6l","type":"STUDY_GUIDE","title":"5.1 Confidence interval for the mean","slug":"confidence-interval","date":null,"keyTopics":[],"publicId":"cn0l264Li1Lhmo6l","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["o2gekTXfUmAhGxdl"],"duration":7},{"id":"L5iQST0TYsoVUt4Y","type":"STUDY_GUIDE","title":"5.2 Confidence interval for the proportion","slug":"confidence-interval-proportion","date":null,"keyTopics":[],"publicId":"L5iQST0TYsoVUt4Y","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["BMBENOhWf0SxoLhA"],"duration":6},{"id":"t3SOYhz7VsHCNY7a","type":"STUDY_GUIDE","title":"5.4 Confidence interval for the difference between proportions","slug":"confidence-interval-difference-proportions","date":null,"keyTopics":[],"publicId":"t3SOYhz7VsHCNY7a","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["JPq4kciEhl7jdUMZ"],"duration":5},{"id":"beytRL0iOZSeEzPK","type":"STUDY_GUIDE","title":"5.5 Interpreting confidence 
intervals","slug":"interpreting-confidence-intervals","date":null,"keyTopics":[],"publicId":"beytRL0iOZSeEzPK","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["00kGvcqqYi2f0gTd"],"duration":6}],"numResources":1},{"id":"lrOzUV5QUEYM8Oz8","name":"Unit 6 – Regression Analysis","emoji":"📚","slug":"unit-6","description":"Unit 6: Regression Analysis","intro":"Regression analysis is a powerful statistical tool used to examine relationships between variables. It helps predict outcomes, estimate the strength of associations, and infer potential causal connections. This method is widely applied in biostatistics, economics, and social sciences.\n\nVarious types of regression models exist, including linear, logistic, and polynomial regression. Key concepts include dependent and independent variables, coefficients, residuals, and R-squared. Building a regression model involves defining research questions, collecting data, selecting appropriate models, and interpreting results.","overview":"## What's Regression Analysis?\n- Statistical method used to examine the relationship between a dependent variable and one or more independent variables\n- Helps predict the value of the dependent variable based on the values of the independent variables\n- Estimates the strength and direction of the relationship between variables\n- Useful for understanding how changes in independent variables affect the dependent variable\n- Can be used for prediction, forecasting, and inferring causal relationships (with caution)\n- Widely applied in various fields, including biostatistics, economics, and social sciences\n- Provides a quantitative measure of the impact of each independent variable on the dependent variable\n\n## Types of Regression Models\n- Linear regression\n - Assumes a linear relationship between the dependent and independent variables\n - Simple linear regression involves one 
independent variable\n - Multiple linear regression involves two or more independent variables\n- Logistic regression\n - Used when the dependent variable is binary or categorical (e.g., presence or absence of a disease)\n - Estimates the probability of an event occurring based on the independent variables\n- Polynomial regression\n - Models non-linear relationships between the dependent and independent variables\n - Includes higher-order terms (squared, cubed, etc.) of the independent variables\n- Stepwise regression\n - Iterative process of adding or removing independent variables based on their statistical significance\n - Helps identify the most relevant variables for the model\n- Ridge regression and Lasso regression\n - Used to handle multicollinearity (high correlation) among independent variables\n - Apply regularization techniques to shrink the coefficients of less important variables\n\n## Key Concepts and Terminology\n- Dependent variable (response variable)\n - The variable being predicted or explained by the model\n - Usually denoted as Y\n- Independent variables (predictor variables, explanatory variables)\n - The variables used to predict or explain the dependent variable\n - Usually denoted as X1, X2, etc.\n- Coefficients (parameters)\n - Numerical values that represent the change in the dependent variable for a one-unit change in the corresponding independent variable, holding other variables constant\n - Denoted as β0 (intercept), β1, β2, etc.\n- Residuals\n - The differences between the observed values of the dependent variable and the predicted values from the regression model\n- R-squared (coefficient of determination)\n - Measures the proportion of variance in the dependent variable that is explained by the independent variables\n - Ranges from 0 to 1, with higher values indicating a better fit of the model to the data\n- P-value\n - Indicates the statistical significance of the relationship between an independent variable and the dependent 
variable\n - A small p-value (typically < 0.05) suggests that the relationship is unlikely to have occurred by chance\n\n## Building a Regression Model\n- Define the research question and identify the dependent and independent variables\n- Collect and preprocess the data\n - Clean the data by handling missing values, outliers, and inconsistencies\n - Transform variables if necessary (e.g., log transformation for skewed data)\n- Explore the data using descriptive statistics and visualizations\n - Examine the distribution of variables and their relationships\n - Check for potential multicollinearity among independent variables\n- Select the appropriate regression model based on the nature of the dependent variable and the relationships observed in the data\n- Estimate the model coefficients using a fitting method (e.g., least squares, maximum likelihood)\n- Assess the model's goodness of fit and performance\n - Evaluate R-squared, adjusted R-squared, and other fit statistics\n - Check the significance of the coefficients using p-values and confidence intervals\n- Validate the model using techniques such as cross-validation or holdout samples\n- Refine the model if necessary by adding or removing variables, transforming variables, or considering interaction terms\n\n## Interpreting Regression Results\n- Coefficient estimates\n - Represent the change in the dependent variable for a one-unit change in the corresponding independent variable, holding other variables constant\n - The sign of the coefficient indicates the direction of the relationship (positive or negative)\n- Standard errors\n - Measure the precision of the coefficient estimates\n - Smaller standard errors indicate more precise estimates\n- P-values and confidence intervals\n - Assess the statistical significance of the coefficients\n - A small p-value (typically < 0.05) and a confidence interval not containing zero suggest a significant relationship\n- Residual analysis\n - Examine the distribution of 
residuals to check for model assumptions (e.g., normality, homoscedasticity)\n - Identify potential outliers or influential observations\n- Practical significance\n - Consider the practical implications of the coefficient estimates\n - Assess whether the magnitude of the effects is meaningful in the context of the problem\n\n## Assumptions and Diagnostics\n- Linearity\n - The relationship between the dependent variable and independent variables should be linear\n - Can be assessed using residual plots or by adding non-linear terms to the model\n- Independence\n - The observations should be independent of each other\n - Violations can occur with time series data or clustered data\n- Normality\n - The residuals should be normally distributed\n - Can be assessed using histograms, Q-Q plots, or statistical tests (e.g., Shapiro-Wilk test)\n- Homoscedasticity\n - The variance of the residuals should be constant across all levels of the independent variables\n - Can be assessed using residual plots or statistical tests (e.g., Breusch-Pagan test)\n- No multicollinearity\n - The independent variables should not be highly correlated with each other\n - Can be assessed using correlation matrices or variance inflation factors (VIF)\n- Influential observations and outliers\n - Identify observations that have a disproportionate impact on the model\n - Can be assessed using leverage values, Cook's distance, or residual plots\n\n## Applications in Biostatistics\n- Epidemiology\n - Identifying risk factors for diseases\n - Estimating the strength of associations between exposures and health outcomes\n- Clinical trials\n - Evaluating the effectiveness of treatments or interventions\n - Adjusting for confounding variables to isolate the treatment effect\n- Genetics and genomics\n - Associating genetic variants with phenotypic traits or diseases\n - Predicting disease risk based on genetic profiles\n- Environmental health\n - Assessing the impact of environmental exposures on health 
outcomes\n - Identifying environmental risk factors for diseases\n- Health services research\n - Analyzing factors associated with healthcare utilization and costs\n - Predicting patient outcomes based on demographic and clinical characteristics\n\n## Common Pitfalls and How to Avoid Them\n- Overfitting\n - Occurs when the model is too complex and fits the noise in the data rather than the underlying patterns\n - Can be avoided by using model selection techniques (e.g., stepwise regression, regularization) and validating the model on independent data\n- Underfitting\n - Occurs when the model is too simple and fails to capture important relationships in the data\n - Can be avoided by considering a wider range of variables and non-linear relationships\n- Extrapolation\n - Applying the model to predict outcomes outside the range of the observed data\n - Can lead to unreliable predictions and should be done with caution\n- Confounding\n - Occurs when an unmeasured variable influences both the dependent and independent variables, leading to spurious associations\n - Can be addressed by carefully selecting variables, using randomization in experiments, or applying statistical techniques (e.g., propensity score matching)\n- Misinterpretation of coefficients\n - Interpreting coefficients without considering the scale and units of the variables\n - Can be avoided by carefully examining the units and scale of the variables and interpreting the coefficients in the appropriate context\n- Ignoring model assumptions\n - Failing to check and address violations of model assumptions\n - Can lead to biased and unreliable results\n - Should be addressed by assessing assumptions using diagnostic tools and applying appropriate remedial measures (e.g., transformations, robust standard errors)","active":true,"order":6,"meta":{"title":"Regression Analysis | Intro to Biostatistics Class Notes","description":"Study guides to review Regression Analysis. 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"9STvcq6HC9ZMpi1B","type":"STUDY_GUIDE","title":"6.3 Logistic regression","slug":"logistic-regression","date":null,"keyTopics":[],"publicId":"9STvcq6HC9ZMpi1B","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["I4PJqudPvmFta7Ze"],"duration":7},{"id":"mkiUjWxLZuIXeFcb","type":"STUDY_GUIDE","title":"6.1 Simple linear regression","slug":"simple-linear-regression","date":null,"keyTopics":[],"publicId":"mkiUjWxLZuIXeFcb","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["vOo7zjNRS9XIXhpi"],"duration":5},{"id":"dalNtc1LwM1X1VfS","type":"STUDY_GUIDE","title":"6.2 Multiple linear regression","slug":"multiple-linear-regression","date":null,"keyTopics":[],"publicId":"dalNtc1LwM1X1VfS","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["lgFyyPomV0FA6yqE"],"duration":8},{"id":"ibO4uUwy3WwUuSjw","type":"STUDY_GUIDE","title":"6.4 Model diagnostics","slug":"model-diagnostics","date":null,"keyTopics":[],"publicId":"ibO4uUwy3WwUuSjw","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["cPBH2WsHuJfH5BXK"],"duration":8},{"id":"wFuxzd24hwhbAiXv","type":"STUDY_GUIDE","title":"6.5 Correlation analysis","slug":"correlation-analysis","date":null,"keyTopics":[],"publicId":"wFuxzd24hwhbAiXv","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["80oIy4v9pmwnXKE2"],"duration":8}],"numResources":1},{"id":"pmM6iTkJlTrbCJhp","name":"Unit 7 – Analysis of Variance 
(ANOVA)","emoji":"📚","slug":"unit-7","description":"Unit 7: Analysis of Variance (ANOVA)","intro":"Analysis of Variance (ANOVA) is a powerful statistical tool used to compare means across multiple groups. It extends the t-test concept, allowing researchers to analyze complex datasets with multiple factors and levels, making it invaluable in biostatistics and medical research.\n\nANOVA helps determine significant differences between group means, providing a framework for understanding variation sources. By enabling efficient analysis of experimental results and observational studies, ANOVA empowers researchers to draw meaningful conclusions and make evidence-based recommendations in healthcare settings.","overview":"## What's ANOVA and Why Should I Care?\n- ANOVA stands for Analysis of Variance, a statistical method used to compare means across multiple groups simultaneously\n- Determines if there are significant differences between the means of three or more independent groups\n- Extends the concepts of the t-test, which can only compare two groups at a time\n- Helps researchers and clinicians make informed decisions based on data-driven evidence\n- Widely used in various fields, including biostatistics, to analyze experimental results and observational studies\n- Allows for the efficient analysis of complex datasets with multiple factors and levels\n- Provides a framework for understanding the sources of variation within and between groups\n- Enables researchers to draw meaningful conclusions and make evidence-based recommendations in healthcare and medical research\n\n## Key Concepts and Terminology\n- Factors are the independent variables in an ANOVA, each with two or more levels (e.g., treatment groups, age categories)\n- Levels represent the different categories or values within a factor (e.g., placebo, low dose, high dose)\n- Response variable is the dependent variable, the outcome being measured (e.g., blood pressure, tumor size)\n- Grand mean is the overall 
mean of the response variable across all groups\n- Group means are the means of the response variable for each specific group or treatment level\n- Sum of squares (SS) measures the variability in the data, divided into SS between groups and SS within groups\n - SS between groups quantifies the variability between the group means and the grand mean\n - SS within groups quantifies the variability of the observations within each group\n- Degrees of freedom (df) represent the number of independent pieces of information used to calculate the statistic\n - df between groups equals the number of groups minus one\n - df within groups equals the total sample size minus the number of groups\n- Mean square (MS) is calculated by dividing the sum of squares by the corresponding degrees of freedom\n - MS between groups is the SS between groups divided by the df between groups\n - MS within groups is the SS within groups divided by the df within groups\n\n## Types of ANOVA: One-Way, Two-Way, and Beyond\n- One-way ANOVA compares means across levels of a single factor (e.g., comparing test scores across different teaching methods)\n- Two-way ANOVA examines the effects of two factors on the response variable, as well as their interaction (e.g., analyzing the impact of both medication and therapy on patient outcomes)\n - Main effects represent the influence of each factor on the response variable, ignoring the other factor\n - Interaction effect occurs when the impact of one factor depends on the level of the other factor\n- Three-way ANOVA extends the analysis to include three factors and their interactions (e.g., investigating the effects of age, gender, and treatment on disease progression)\n- Repeated measures ANOVA is used when the same subjects are measured under different conditions or at multiple time points (e.g., assessing the effectiveness of a weight loss program over time)\n- Multivariate ANOVA (MANOVA) is employed when there are multiple related response variables 
(e.g., evaluating the impact of a drug on both systolic and diastolic blood pressure)\n- Mixed-effects ANOVA incorporates both fixed and random factors, allowing for the generalization of findings beyond the specific levels included in the study\n\n## Setting Up Your ANOVA: Hypotheses and Assumptions\n- Null hypothesis (H0) states that all group population means are equal (e.g., H0: μ1 = μ2 = μ3)\n- Alternative hypothesis (Ha) proposes that at least one group population mean differs from the others (e.g., Ha: at least one μi ≠ μj)\n- Independence assumption requires that observations within and between groups are independent of each other\n - Randomly assign subjects to treatment groups to ensure independence\n - Avoid repeated measurements on the same individuals, unless using a repeated measures ANOVA\n- Normality assumption states that the response variable should be approximately normally distributed within each group\n - Assess normality using visual methods (e.g., histograms, Q-Q plots) or statistical tests (e.g., Shapiro-Wilk test)\n - ANOVA is generally robust to moderate violations of normality, especially with large and equal sample sizes\n- Homogeneity of variance assumption requires that the population variances of the response variable are equal across all groups\n - Evaluate this assumption using Levene's test or by comparing the largest and smallest group variances\n - If violated, consider transforming the data or using a non-parametric alternative (e.g., Kruskal-Wallis test)\n\n## Crunching the Numbers: F-statistic and p-values\n- The F-statistic is the ratio of the between-group variability to the within-group variability, calculated as:\n - $F = \\frac{MS \\text{ between groups}}{MS \\text{ within groups}}$\n- A large F-statistic indicates that the between-group variability is much larger than the within-group variability, suggesting significant differences between group means\n- The p-value associated with the 
F-statistic represents the probability of observing an F-statistic at least as extreme as the one observed, assuming the null hypothesis is true\n - A small p-value (typically < 0.05) provides evidence against the null hypothesis, indicating significant differences between group means\n - A large p-value (> 0.05) suggests insufficient evidence to reject the null hypothesis, implying no significant differences between group means\n- The critical F-value is determined by the significance level (α), the degrees of freedom for the numerator (df between groups), and the degrees of freedom for the denominator (df within groups)\n - If the observed F-statistic exceeds the critical F-value, reject the null hypothesis\n- Effect size measures, such as eta-squared (η²) or omega-squared (ω²), quantify the magnitude of the differences between groups\n - Eta-squared: $\\eta^2 = \\frac{SS \\text{ between groups}}{SS \\text{ total}}$\n - Omega-squared: $\\omega^2 = \\frac{SS \\text{ between groups} - (df \\text{ between groups}) \\times MS \\text{ within groups}}{SS \\text{ total} + MS \\text{ within groups}}$\n\n## Interpreting ANOVA Results: What Do They Actually Mean?\n- A significant F-test indicates that at least one group mean differs significantly from the others, but it does not specify which group(s) differ\n- Post-hoc tests, such as Tukey's HSD or Bonferroni correction, are used to make pairwise comparisons between group means while controlling for the familywise error rate\n - Tukey's HSD test is more powerful and widely used when sample sizes are equal\n - Bonferroni correction is more conservative and can be used with unequal sample sizes\n- Confidence intervals for the group means and their differences provide a range of plausible values for the true population parameters\n- The practical significance of the results should be considered alongside the statistical significance\n - A statistically significant result may not be practically meaningful if the effect size is small or the differences 
between groups are not clinically relevant\n- Non-significant results should be interpreted cautiously, as they may be due to insufficient sample size (low power) or high variability within groups\n- Reporting ANOVA results should include the F-statistic, degrees of freedom, p-value, effect size, and post-hoc comparisons (if applicable)\n\n## Real-World Applications in Biostatistics\n- Comparing the effectiveness of different treatments or interventions on patient outcomes (e.g., evaluating the impact of various medications on blood glucose levels in patients with diabetes)\n- Assessing the influence of risk factors on disease progression or severity (e.g., investigating the effects of age, gender, and smoking status on lung function in patients with COPD)\n- Evaluating the performance of diagnostic tests across different patient subgroups (e.g., comparing the sensitivity and specificity of a new cancer screening test in different age and risk categories)\n- Analyzing the impact of environmental factors on public health outcomes (e.g., examining the relationship between air pollution levels and respiratory hospital admissions in different cities)\n- Investigating the effects of genetic variations on treatment response or disease susceptibility (e.g., assessing the influence of specific gene polymorphisms on the efficacy and safety of a drug)\n- Comparing patient-reported outcomes across different healthcare settings or providers (e.g., evaluating patient satisfaction scores in various hospital departments or clinics)\n- Assessing the effectiveness of public health interventions or policies (e.g., comparing vaccination rates or disease incidence before and after implementing a new immunization program)\n\n## Common Pitfalls and How to Avoid Them\n- Failing to check and address violations of ANOVA assumptions\n - Always assess the assumptions of independence, normality, and homogeneity of variance\n - Consider alternative methods (e.g., non-parametric tests, data 
transformations) if assumptions are severely violated\n- Misinterpreting non-significant results as evidence of no difference between groups\n - Non-significant results may be due to insufficient sample size or high variability within groups\n - Report confidence intervals and effect sizes to provide a more complete picture of the results\n- Conducting multiple pairwise comparisons without adjusting for the familywise error rate\n - Use appropriate post-hoc tests (e.g., Tukey's HSD, Bonferroni correction) to control for the increased risk of Type I errors when making multiple comparisons\n- Overinterpreting statistically significant results without considering practical significance\n - Evaluate the magnitude of the differences between groups and their clinical or practical relevance\n - Report effect sizes and confidence intervals to help contextualize the findings\n- Ignoring the potential impact of outliers or influential observations on the results\n - Inspect the data for extreme values or unusual observations that may disproportionately affect the analysis\n - Consider sensitivity analyses (e.g., removing outliers, using robust methods) to assess the robustness of the findings\n- Failing to report all relevant information when presenting ANOVA results\n - Include the F-statistic, degrees of freedom, p-value, effect size, and post-hoc comparisons (if applicable)\n - Provide a clear description of the factors, levels, and response variable, along with the sample sizes for each group\n- Overgeneralizing the findings beyond the scope of the study\n - Be cautious when extrapolating the results to populations or settings not represented in the sample\n - Clearly state the limitations and potential sources of bias in the study design and analysis","active":true,"order":7,"meta":{"title":"Analysis of Variance (ANOVA) | Intro to Biostatistics Class Notes","description":"Study guides to review Analysis of Variance (ANOVA). 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"YOlFtb7aiCZsGamU","type":"STUDY_GUIDE","title":"7.5 Assumptions and diagnostics","slug":"assumptions-diagnostics","date":null,"keyTopics":[],"publicId":"YOlFtb7aiCZsGamU","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["IqI96EA6hy9DrQy0"],"duration":7},{"id":"60h4kp5pU0UzVNz7","type":"STUDY_GUIDE","title":"7.2 Two-way ANOVA","slug":"two-way-anova","date":null,"keyTopics":[],"publicId":"60h4kp5pU0UzVNz7","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["17DnWSP6DHHrT1Mw"],"duration":9},{"id":"B6Y0sj2UQAFyRXEV","type":"STUDY_GUIDE","title":"7.3 Repeated measures ANOVA","slug":"repeated-measures-anova","date":null,"keyTopics":[],"publicId":"B6Y0sj2UQAFyRXEV","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["TYm9VKdPDQ0D8dEm"],"duration":5},{"id":"igNe9jDwwC8mMu8r","type":"STUDY_GUIDE","title":"7.4 Post-hoc tests","slug":"post-hoc-tests","date":null,"keyTopics":[],"publicId":"igNe9jDwwC8mMu8r","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["2olC7NUpw5LqrLg5"],"duration":8},{"id":"RfAD16ZrOUhh1kqw","type":"STUDY_GUIDE","title":"7.1 One-way ANOVA","slug":"one-way-anova","date":null,"keyTopics":[],"publicId":"RfAD16ZrOUhh1kqw","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["qICrNtF0uKITCGhv"],"duration":5}],"numResources":1},{"id":"53U9MGSYxB66PLbg","name":"Unit 8 – Experimental Design","emoji":"📚","slug":"unit-8","description":"Unit 8: Experimental 
Design","intro":"Experimental design is the backbone of scientific research, providing a structured approach to testing hypotheses and drawing valid conclusions. This unit covers key concepts like independent and dependent variables, control groups, and randomization, essential for conducting rigorous studies.\n\nThe unit also explores various types of experimental designs, sampling methods, and data collection strategies. It delves into statistical analysis approaches and ethical considerations, providing a comprehensive overview of the tools and principles used in biostatistical research.","overview":"## Key Concepts\n- Experimental design involves planning and conducting a study to test a hypothesis while controlling for potential confounding variables\n- Independent variable represents the factor being manipulated or changed by the researcher to observe its effect on the dependent variable\n- Dependent variable measures the outcome or response that is being studied and is expected to change based on the manipulation of the independent variable\n- Control group serves as a baseline for comparison, receiving no treatment or a standard treatment, while the experimental group receives the intervention being tested\n- Confounding variables are extraneous factors that can influence the relationship between the independent and dependent variables, potentially leading to biased results if not properly controlled\n- Randomization assigns subjects to different treatment groups by chance, ensuring that any differences between groups are due to the intervention rather than pre-existing differences\n- Blinding conceals the treatment allocation from participants, researchers, or both (double-blinding) to minimize bias and placebo effects\n- Statistical significance indicates whether the observed differences between groups are likely due to chance or the intervention, typically set at a p-value threshold of 0.05\n\n## Types of Experimental Designs\n- Completely randomized 
design assigns subjects to treatment groups purely by chance, ensuring each subject has an equal probability of being in any group\n- Randomized block design divides subjects into homogeneous subgroups (blocks) based on a specific characteristic before randomly assigning treatments within each block\n- Factorial design investigates the effects of two or more independent variables simultaneously, allowing for the examination of main effects and interactions between variables\n- Crossover design exposes each subject to all treatments in a random order, with a washout period between treatments to minimize carryover effects\n- Matched pairs design pairs subjects with similar characteristics and randomly assigns one member of each pair to each treatment group\n- Repeated measures design involves exposing each subject to all treatments over time, allowing for within-subject comparisons and reducing the influence of individual differences\n- Latin square design arranges treatments in a grid to control for two sources of variability (rows and columns) while ensuring each treatment appears only once in each row and column\n- Split-plot design combines elements of completely randomized and randomized block designs, with some factors randomized to larger plots and others to smaller subplots within each plot\n\n## Variables and Controls\n- Independent variable is the factor manipulated by the researcher to observe its effect on the dependent variable (e.g., drug dosage, teaching method)\n- Dependent variable is the outcome measured in response to changes in the independent variable (e.g., blood pressure, test scores)\n - Continuous dependent variables can take on any value within a range (e.g., weight, time)\n - Categorical dependent variables have distinct categories or levels (e.g., gender, disease status)\n- Confounding variables are extraneous factors that can influence the relationship between the independent and dependent variables, leading to biased results if not 
controlled\n - Examples of confounding variables include age, gender, socioeconomic status, and pre-existing health conditions\n- Control variables are kept constant throughout the experiment to minimize their influence on the dependent variable (e.g., room temperature, lighting conditions)\n- Placebo control involves giving a treatment that appears identical to the active intervention but has no known effect, to account for psychological effects and biases\n- Positive control is a treatment known to produce the desired effect, used to validate the experimental setup and ensure the study can detect a true effect\n- Negative control is a treatment known to have no effect, used to establish a baseline and detect any confounding factors or biases in the experimental setup\n\n## Sampling Methods\n- Simple random sampling selects subjects from a population purely by chance, giving each individual an equal probability of being chosen\n- Stratified random sampling divides the population into homogeneous subgroups (strata) based on a specific characteristic, then randomly samples from each stratum in proportion to its size\n- Cluster sampling involves dividing the population into naturally occurring groups (clusters), randomly selecting a subset of clusters, and including all individuals within those clusters in the sample\n- Systematic sampling selects subjects from a population at regular intervals (e.g., every 10th individual) after randomly choosing a starting point\n- Convenience sampling selects subjects based on their availability and willingness to participate, often used in pilot studies or when random sampling is not feasible\n- Purposive sampling selects subjects based on specific characteristics or criteria relevant to the research question; because selection is non-random, the sample is not guaranteed to be representative of the population of interest\n- Snowball sampling relies on initial subjects to recruit additional participants from their social networks, useful for studying hard-to-reach or hidden 
populations\n- Quota sampling sets a predetermined number of subjects to be selected from each subgroup within the population, ensuring the sample reflects the population's composition\n\n## Randomization Techniques\n- Simple randomization assigns subjects to treatment groups purely by chance, using methods like flipping a coin, drawing names from a hat, or using a random number generator\n- Block randomization divides subjects into smaller, homogeneous subgroups (blocks) before randomly assigning treatments within each block to ensure balanced representation of key characteristics\n- Stratified randomization combines stratified sampling and randomization, dividing the population into strata based on specific characteristics, then randomly assigning treatments within each stratum\n- Covariate adaptive randomization uses information about subjects' baseline characteristics to assign treatments in a way that minimizes imbalances between groups\n- Permuted block randomization creates blocks of a fixed size (e.g., 4 or 6) and randomly assigns treatments within each block, ensuring balanced treatment allocation throughout the study\n- Biased coin randomization adjusts the probability of assignment to each treatment group based on the number of subjects already assigned, to maintain balance between groups\n- Urn randomization involves drawing colored balls from an urn, with each color representing a treatment group, and replacing the drawn ball with one or more balls of the opposite color to maintain balance\n- Minimization assigns subjects to treatment groups based on specific baseline characteristics, aiming to minimize the overall imbalance between groups across all factors\n\n## Data Collection Strategies\n- Surveys and questionnaires gather self-reported data from subjects using a standardized set of questions, which can be administered in person, by mail, phone, or online\n- Interviews involve a researcher asking subjects open-ended or structured questions to 
collect detailed, qualitative data on their experiences, opinions, or behaviors\n- Observations involve researchers directly watching and recording subjects' behaviors or events of interest, either in a natural setting or a controlled laboratory environment\n- Physiological measurements collect data on subjects' bodily functions and processes, such as heart rate, blood pressure, or brain activity, using specialized equipment (e.g., ECG, EEG)\n- Behavioral tests assess subjects' performance on specific tasks or activities, such as cognitive tests, physical assessments, or simulated scenarios\n- Archival data involves using existing records or datasets, such as medical records, government statistics, or historical documents, to answer research questions\n- Ecological momentary assessment (EMA) collects real-time data on subjects' experiences, behaviors, and moods in their natural environment using methods like diaries or mobile apps\n- Biomarkers are measurable indicators of biological processes, such as blood tests or genetic analyses, used to assess health status, disease risk, or treatment response\n\n## Statistical Analysis Approaches\n- Descriptive statistics summarize and describe the main features of a dataset, such as measures of central tendency (mean, median, mode) and dispersion (range, standard deviation)\n- Inferential statistics use sample data to make generalizations or predictions about a larger population, testing hypotheses and estimating parameters\n- t-tests compare the means of two groups to determine if they are significantly different, assuming the data follows a normal distribution and the groups have equal variances\n- Analysis of variance (ANOVA) tests for differences in means among three or more groups, partitioning the total variation into between-group and within-group components\n- Correlation analysis assesses the strength and direction of the linear relationship between two continuous variables, using measures like Pearson's or 
Spearman's correlation coefficients\n- Regression analysis models the relationship between a dependent variable and one or more independent variables, allowing for prediction and estimation of the dependent variable's value\n- Chi-square tests examine the association between two categorical variables, comparing observed frequencies to expected frequencies under the null hypothesis of independence\n- Non-parametric tests, such as Mann-Whitney U, Wilcoxon signed-rank, or Kruskal-Wallis, analyze data that does not follow a normal distribution or has ordinal or ranked variables\n\n## Ethical Considerations\n- Informed consent ensures that subjects understand the purpose, procedures, risks, and benefits of the study and voluntarily agree to participate\n- Confidentiality protects subjects' personal information and data from unauthorized access or disclosure, using methods like anonymization or secure storage\n- Beneficence requires researchers to maximize the potential benefits of the study while minimizing any harm or risks to subjects\n- Justice ensures that the benefits and burdens of research are distributed fairly among different groups, avoiding exploitation or discrimination\n- Respect for persons recognizes subjects' autonomy and right to make their own decisions, providing them with sufficient information and the freedom to withdraw from the study at any time\n- Scientific integrity involves conducting research honestly, objectively, and transparently, avoiding fabrication, falsification, or plagiarism of data or results\n- Social responsibility considers the broader societal implications and consequences of research, ensuring that studies address important issues and contribute to the public good\n- Institutional review boards (IRBs) review and approve research proposals to ensure they meet ethical standards and protect the rights and welfare of human subjects\n\n## Real-World Applications\n- Clinical trials test the safety and efficacy of new drugs, medical 
devices, or interventions in human subjects, following a carefully designed protocol and regulatory guidelines\n- Educational research evaluates the effectiveness of different teaching methods, curricula, or educational policies on student learning outcomes and achievement\n- Environmental studies investigate the impact of human activities or natural processes on ecosystems, biodiversity, or environmental health, informing conservation and management strategies\n- Market research assesses consumer preferences, behaviors, and opinions to inform product development, pricing, or advertising strategies for businesses\n- Psychological research explores the underlying mechanisms and factors influencing human cognition, emotion, and behavior, contributing to the understanding and treatment of mental health issues\n- Public health research examines the distribution and determinants of health and disease in populations, informing the development of interventions and policies to improve health outcomes\n- Social science research investigates social phenomena, such as social interactions, cultural practices, or political attitudes, to understand and address societal issues and challenges\n- Sports science research applies scientific principles to enhance athletic performance, prevent injuries, and optimize training and recovery strategies for athletes and coaches","active":true,"order":8,"meta":{"title":"Experimental Design | Intro to Biostatistics Class Notes","description":"Study guides to review Experimental Design. 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"pFXBLuu48ndsErJT","type":"STUDY_GUIDE","title":"8.1 Randomization","slug":"randomization","date":null,"keyTopics":[],"publicId":"pFXBLuu48ndsErJT","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["kI0CCn4jc0t7dxuk"],"duration":9},{"id":"PtDgoky63gDBaOrp","type":"STUDY_GUIDE","title":"8.2 Blinding","slug":"blinding","date":null,"keyTopics":[],"publicId":"PtDgoky63gDBaOrp","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["DgQB7MWohT4v1ZNr"],"duration":10},{"id":"PtNWzA6JlwMX79DD","type":"STUDY_GUIDE","title":"8.5 Factorial designs","slug":"factorial-designs","date":null,"keyTopics":[],"publicId":"PtNWzA6JlwMX79DD","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["KnljVYQS1grvhyRT"],"duration":9},{"id":"nq1omVcRclG3w7vG","type":"STUDY_GUIDE","title":"8.3 Control groups","slug":"control-groups","date":null,"keyTopics":[],"publicId":"nq1omVcRclG3w7vG","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["XtmSu7Z1mdx7etgP"],"duration":8},{"id":"IgNoPSwD4JdUPNzl","type":"STUDY_GUIDE","title":"8.4 Sample size determination","slug":"sample-size-determination","date":null,"keyTopics":[],"publicId":"IgNoPSwD4JdUPNzl","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["ocI2QkK71kyYWIVu"],"duration":6}],"numResources":1},{"id":"kVVMucz579uvt2r7","name":"Unit 9 – Survival Analysis","emoji":"📚","slug":"unit-9","description":"Unit 9: Survival 
Analysis","intro":"Survival analysis is a crucial statistical method in biomedical research, focusing on time-to-event data. It allows researchers to study the timing of events like disease onset or treatment outcomes, even when some subjects haven't experienced the event by the study's end.\n\nKey concepts include censoring, survival and hazard functions, and the Kaplan-Meier method. The Cox proportional hazards model is widely used to analyze the effects of multiple factors on survival. These tools help researchers compare treatments, identify risk factors, and develop prognostic models in various medical fields.","overview":"## What's Survival Analysis?\n- Branch of statistics focused on analyzing time-to-event data where the outcome variable is the time until an event of interest occurs\n- Commonly used in medical research to study the effectiveness of treatments, risk factors for disease, and prognostic factors\n- Allows for the inclusion of censored data, which occurs when the event of interest has not been observed for a subject during the study period\n- Differs from other statistical methods as it accounts for the fact that the event of interest may not have occurred for all subjects by the end of the study\n- Provides insights into the probability of an event occurring over time and the factors that influence this probability\n- Enables researchers to compare survival patterns between different groups (treatment vs. 
control) and identify risk factors associated with the event of interest\n- Offers a flexible framework for handling various types of censoring (right, left, or interval) and accommodating time-dependent covariates\n\n## Key Concepts and Terms\n- Event: The specific occurrence of interest, such as death, disease recurrence, or equipment failure\n- Survival time: The duration from the starting point (e.g., diagnosis or treatment initiation) until the event occurs or the subject is censored\n- Censoring: Occurs when the event of interest has not been observed for a subject during the study period\n - Right censoring: The most common type, where the subject is still event-free at the end of the study or is lost to follow-up\n - Left censoring: When the event of interest has already occurred before the subject is included in the study\n - Interval censoring: When the event is known to have occurred within a specific time interval, but the exact time is unknown\n- Survival function $S(t)$: The probability that an individual survives beyond time $t$\n- Hazard function $h(t)$: The instantaneous rate of experiencing the event at time $t$, given that the individual has survived up to that point\n- Kaplan-Meier estimator: A non-parametric method for estimating the survival function from observed survival times, accounting for censoring\n- Cox proportional hazards model: A semi-parametric regression model that relates the hazard function to a set of covariates, assuming that the hazard ratios between groups remain constant over time\n\n## Types of Survival Data\n- Right-censored data: The most common type, where the event of interest has not occurred by the end of the study period or the subject is lost to follow-up\n- Left-censored data: When the event of interest has already occurred before the subject is included in the study\n- Interval-censored data: When the event is known to have occurred within a specific time interval, but the exact time is unknown\n- Truncated data: 
When subjects are not included in the study until they have reached a certain point in their survival time\n - Left truncation: Subjects are only included if they have survived up to a specific time point\n - Right truncation: Subjects are only included if the event occurs before a specific time point\n- Competing risks data: When subjects are at risk of experiencing multiple, mutually exclusive events (e.g., death from different causes)\n- Recurrent event data: When the event of interest can occur multiple times for the same subject (e.g., asthma attacks or hospital readmissions)\n\n## Survival and Hazard Functions\n- Survival function $S(t)$ represents the probability that an individual survives beyond time $t$\n - Defined as $S(t) = P(T > t)$, where $T$ is the survival time\n - Ranges from 1 at the start of the study (when $t = 0$) to 0 as $t$ approaches infinity\n - Can be estimated non-parametrically using the Kaplan-Meier method or parametrically by assuming a specific distribution for the survival times (e.g., exponential, Weibull, or log-normal)\n- Hazard function $h(t)$ represents the instantaneous rate of experiencing the event at time $t$, given that the individual has survived up to that point\n - Defined as $h(t) = \\lim_{\\Delta t \\to 0} \\frac{P(t \\leq T < t + \\Delta t | T \\geq t)}{\\Delta t}$\n - Provides insights into how the risk of the event changes over time\n - Can be modeled using the Cox proportional hazards model or parametric models (e.g., exponential, Weibull, or log-normal)\n- The survival and hazard functions are related through the cumulative hazard function $H(t)$, which is defined as the integral of the hazard function from 0 to $t$\n - The relationship between $S(t)$ and $H(t)$ is given by $S(t) = \\exp(-H(t))$\n\n## Kaplan-Meier Method\n- Non-parametric method for estimating the survival function from observed survival times, accounting for censoring\n- Calculates the probability of surviving beyond each observed event time, 
conditional on having survived up to that point\n- Estimates the survival function as a step function, with drops at each observed event time\n- Formula for the Kaplan-Meier estimator:\n - Let $t_1 < t_2 < ... < t_k$ be the distinct observed event times\n - Let $d_i$ be the number of events at time $t_i$ and $n_i$ be the number of individuals at risk just prior to time $t_i$\n - The Kaplan-Meier estimator is given by $\\hat{S}(t) = \\prod_{i: t_i \\leq t} (1 - \\frac{d_i}{n_i})$\n- Provides a visual representation of the survival experience of the study population\n- Allows for the comparison of survival curves between different groups using the log-rank test\n- Limitations include the inability to incorporate covariates directly and the assumption that censoring is non-informative\n\n## Cox Proportional Hazards Model\n- Semi-parametric regression model that relates the hazard function to a set of covariates\n- Assumes that the hazard ratios between groups remain constant over time (proportional hazards assumption)\n- The model is defined as $h(t|X) = h_0(t) \\exp(\\beta_1 X_1 + \\beta_2 X_2 + ... 
+ \\beta_p X_p)$, where:\n - $h(t|X)$ is the hazard function for an individual with covariate values $X = (X_1, X_2, ..., X_p)$\n - $h_0(t)$ is the baseline hazard function, which is left unspecified\n - $\\beta_1, \\beta_2, ..., \\beta_p$ are the regression coefficients that quantify the effect of each covariate on the hazard\n- Coefficients are estimated using partial likelihood, which accounts for the ordering of the event times without specifying the baseline hazard function\n- Hazard ratios (exp($\\beta_i$)) represent the multiplicative effect of a one-unit increase in the corresponding covariate on the hazard, assuming all other covariates remain constant\n- Allows for the inclusion of both continuous and categorical covariates\n- Can be extended to incorporate time-dependent covariates and stratification factors\n- Model assumptions (proportional hazards, linearity, and non-informative censoring) should be assessed before interpreting the results\n\n## Interpreting Survival Analysis Results\n- Kaplan-Meier curves provide a visual representation of the survival experience over time\n - Steep drops indicate time points with a high number of events\n - Wide gaps between curves suggest a substantial difference in survival between groups\n - Crossing curves may indicate non-proportional hazards\n- Log-rank test assesses the statistical significance of the difference in survival curves between groups\n - A small p-value (typically < 0.05) suggests a significant difference in survival\n- Cox proportional hazards model results are typically presented as hazard ratios (HR) with 95% confidence intervals (CI)\n - An HR > 1 indicates an increased risk of the event for the corresponding covariate, while an HR < 1 indicates a decreased risk\n - The 95% CI provides a range of plausible values for the true HR; if the CI does not include 1, the covariate is considered statistically significant\n- Proportional hazards assumption can be assessed using graphical methods (e.g., 
log-log survival plots or Schoenfeld residuals) or statistical tests (e.g., time-dependent covariates or Grambsch-Therneau test)\n- Model fit can be evaluated using measures such as the likelihood ratio test, Wald test, or score test\n- Results should be interpreted in the context of the study design, population, and research question, considering potential confounding factors and limitations\n\n## Real-World Applications in Biostatistics\n- Clinical trials: Evaluating the efficacy and safety of new treatments or interventions\n - Comparing survival outcomes between treatment and control groups\n - Identifying subgroups of patients who may benefit more from a specific treatment\n- Epidemiological studies: Investigating risk factors for disease onset or progression\n - Assessing the impact of lifestyle factors (smoking, diet, physical activity) on disease-free survival\n - Examining the role of genetic or environmental factors in the development of chronic diseases\n- Prognostic studies: Developing models to predict patient outcomes based on clinical, demographic, or molecular characteristics\n - Identifying prognostic biomarkers that can stratify patients into risk groups\n - Developing risk scores or nomograms to aid in treatment decision-making\n- Public health research: Analyzing population-level trends in disease incidence, prevalence, and mortality\n - Evaluating the effectiveness of screening programs or public health interventions\n - Investigating disparities in health outcomes across different socioeconomic or racial/ethnic groups\n- Reliability analysis: Assessing the durability or failure rates of medical devices or equipment\n - Comparing the performance of different device designs or materials\n - Identifying factors that contribute to device failure or malfunction","active":true,"order":9,"meta":{"title":"Survival Analysis | Intro to Biostatistics Class Notes","description":"Study guides to review Survival Analysis. 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"e4D2tkFWN44Q8fNq","type":"STUDY_GUIDE","title":"9.1 Kaplan-Meier estimator","slug":"kaplan-meier-estimator","date":null,"keyTopics":[],"publicId":"e4D2tkFWN44Q8fNq","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["dLaZetw1IBcSGWvp"],"duration":8},{"id":"HGGR4TjJqb7DiHQy","type":"STUDY_GUIDE","title":"9.3 Censoring","slug":"censoring","date":null,"keyTopics":[],"publicId":"HGGR4TjJqb7DiHQy","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["o3vxh2xss8vnbdpS"],"duration":11},{"id":"tbsbuwbhiTDaFU6Z","type":"STUDY_GUIDE","title":"9.5 Hazard ratios","slug":"hazard-ratios","date":null,"keyTopics":[],"publicId":"tbsbuwbhiTDaFU6Z","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["e0hv8wfNvtaW7h5Y"],"duration":3},{"id":"EfuaY8jSNC9ZUNSr","type":"STUDY_GUIDE","title":"9.4 Log-rank test","slug":"log-rank-test","date":null,"keyTopics":[],"publicId":"EfuaY8jSNC9ZUNSr","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["upIBdqhhZXCUj0kJ"],"duration":5},{"id":"IiPy3TSXkudZ28cm","type":"STUDY_GUIDE","title":"9.2 Cox proportional hazards model","slug":"cox-proportional-hazards-model","date":null,"keyTopics":[],"publicId":"IiPy3TSXkudZ28cm","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["jtj7lPiGdH90Yxro"],"duration":8}],"numResources":1},{"id":"FD86VDr2IzTFDYGz","name":"Unit 10 – Epidemiological Measures","emoji":"📚","slug":"unit-10","description":"Unit 10: 
Epidemiological Measures","intro":"Epidemiological measures are essential tools for studying health events in populations. They help quantify disease frequency, identify risk factors, and assess the impact of public health interventions. These measures include incidence, prevalence, and mortality rates, which provide insights into disease patterns and trends.\n\nUnderstanding epidemiological measures is crucial for public health professionals and researchers. By applying these concepts, they can design effective studies, interpret results accurately, and make informed decisions about disease prevention and control strategies. Mastering these measures enables better assessment of population health and guides evidence-based public health policies.","overview":"## What's This Unit All About?\n- Epidemiological measures provide a quantitative approach to studying the distribution and determinants of health-related events in populations\n- Focuses on the frequency, pattern, and causes of health outcomes in defined populations (communities, countries, global)\n- Enables public health professionals to identify risk factors for disease and targets for preventive healthcare\n- Measures used to assess the burden of disease in a population include incidence, prevalence, and mortality rates\n- Key concepts include population at risk, exposure, outcome, and time period\n- Epidemiological studies can be observational (cohort, case-control, cross-sectional) or experimental (randomized controlled trials)\n- Findings from epidemiological studies inform public health policies, programs, and interventions to control and prevent disease\n\n## Key Concepts You Need to Know\n- Population at risk: The group of people, healthy or sick, who would be counted as cases if they had the disease being studied\n- Incidence: The number of new cases of a disease in a population over a specified period of time\n - Incidence rate: The number of new cases of a disease divided by the population at risk 
over a specified time period\n- Prevalence: The proportion of a population that has a disease at a specific point in time\n - Point prevalence: The proportion of a population that has the disease at a specific point in time\n - Period prevalence: The proportion of a population that has the disease at any time during a given period\n- Mortality rate: The number of deaths due to a disease divided by the total population over a specified time period\n- Risk factor: A characteristic, condition, or behavior that increases the likelihood of developing a disease\n- Relative risk (RR): The ratio of the risk of disease in exposed individuals to the risk in unexposed individuals\n- Odds ratio (OR): The ratio of the odds of exposure in cases to the odds of exposure in controls\n- Attributable risk (AR): The difference in risk between exposed and unexposed individuals\n\n## Types of Epidemiological Measures\n- Incidence measures: Focus on new cases of disease over a specified time period\n - Cumulative incidence: The proportion of a population at risk that develops the disease over a specified time period\n - Incidence rate: The number of new cases of a disease divided by the population at risk over a specified time period\n- Prevalence measures: Focus on existing cases of disease at a specific point or period in time\n - Point prevalence: The proportion of a population that has the disease at a specific point in time\n - Period prevalence: The proportion of a population that has the disease at any time during a given period\n- Mortality measures: Focus on deaths due to a specific disease\n - Crude mortality rate: The number of deaths due to a disease divided by the total population over a specified time period\n - Case fatality rate: The proportion of individuals diagnosed with a disease who die from that disease over a specified time period\n- Measures of association: Quantify the relationship between an exposure and an outcome\n - Relative risk (RR): The ratio of the risk 
of disease in exposed individuals to the risk in unexposed individuals\n - Odds ratio (OR): The ratio of the odds of exposure in cases to the odds of exposure in controls\n- Measures of impact: Assess the public health burden of a risk factor\n - Attributable risk (AR): The difference in risk between exposed and unexposed individuals\n - Population attributable risk (PAR): The portion of disease in a population that can be attributed to a specific exposure\n\n## Crunching the Numbers: Calculations and Formulas\n- Incidence rate: $\\frac{\\text{Number of new cases}}{\\text{Population at risk} \\times \\text{Time period}}$\n- Cumulative incidence: $\\frac{\\text{Number of new cases}}{\\text{Population at risk}} \\times 100\\%$\n- Point prevalence: $\\frac{\\text{Number of existing cases}}{\\text{Total population}} \\times 100\\%$\n- Period prevalence: $\\frac{\\text{Number of existing cases during period}}{\\text{Average population during period}} \\times 100\\%$\n- Crude mortality rate: $\\frac{\\text{Number of deaths}}{\\text{Total population} \\times \\text{Time period}}$\n- Case fatality rate: $\\frac{\\text{Number of deaths from disease}}{\\text{Number of diagnosed cases}} \\times 100\\%$\n- Relative risk (RR): $\\frac{\\text{Risk in exposed}}{\\text{Risk in unexposed}}$\n- Odds ratio (OR): $\\frac{\\text{Odds of exposure in cases}}{\\text{Odds of exposure in controls}}$\n- Attributable risk (AR): $\\text{Risk in exposed} - \\text{Risk in unexposed}$\n- Population attributable risk (PAR): $\\frac{\\text{Total cases} - \\text{Cases in unexposed}}{\\text{Total cases}} \\times 100\\%$\n\n## Real-World Applications\n- Outbreak investigations: Epidemiological measures are used to identify the source and extent of disease outbreaks (Ebola, COVID-19)\n- Disease surveillance: Ongoing systematic collection, analysis, and interpretation of health data to inform public health action\n- Vaccine efficacy studies: Measures like relative risk and attributable risk are used to 
assess the effectiveness of vaccines in preventing disease (HPV vaccine)\n- Environmental health studies: Epidemiological measures help quantify the impact of environmental exposures on health outcomes (air pollution and respiratory diseases)\n- Health disparities research: Measures like prevalence and mortality rates can highlight health inequalities among different populations (racial/ethnic disparities in chronic diseases)\n- Clinical decision-making: Measures of association, such as relative risk and odds ratio, inform clinical guidelines and decision-making (screening recommendations for cancers)\n\n## Common Pitfalls and How to Avoid Them\n- Selection bias: Occurs when the study population is not representative of the target population\n - Ensure that the study sample is randomly selected and representative of the target population\n- Information bias: Occurs when there are systematic differences in the way data is collected from different groups\n - Use standardized data collection methods and validate measurement tools\n- Confounding: Occurs when an extraneous factor is associated with both the exposure and the outcome, distorting the true relationship\n - Identify potential confounders and adjust for them in the analysis using methods like stratification or multivariate regression\n- Ecological fallacy: Occurs when inferences about individual-level associations are made based on group-level data\n - Use individual-level data whenever possible and be cautious when interpreting group-level associations\n- Reverse causality: Occurs when the outcome precedes the exposure in time, leading to a misinterpretation of the causal relationship\n - Ensure that the exposure precedes the outcome in time and consider alternative explanations for the observed association\n\n## Pro Tips for Acing This Unit\n- Practice calculating epidemiological measures using real data sets to reinforce your understanding of the concepts and formulas\n- Pay attention to the units and time 
periods used in each measure to ensure you are interpreting the results correctly\n- Use visual aids like 2x2 tables and flow diagrams to organize information and clarify relationships between variables\n- Focus on understanding the key differences between measures like incidence and prevalence, relative risk and odds ratio, and attributable risk and population attributable risk\n- Apply your knowledge of epidemiological measures to real-world public health scenarios to appreciate their practical importance and relevance\n- Stay up-to-date with current public health issues and research to see how epidemiological measures are used to inform policy and practice\n\n## Beyond the Basics: Advanced Topics\n- Effect modification: Occurs when the association between an exposure and an outcome varies by levels of a third variable (age, sex)\n - Assess effect modification by stratifying the analysis by the potential modifier and comparing the measures of association across strata\n- Interaction: Occurs when the joint effect of two exposures on an outcome is different from the sum of their individual effects\n - Evaluate interaction by including an interaction term in the regression model and testing its statistical significance\n- Causal inference: The process of determining whether an observed association between an exposure and an outcome represents a causal relationship\n - Use criteria like temporality, strength of association, dose-response, and biological plausibility to assess causality\n- Directed acyclic graphs (DAGs): Visual representations of the causal relationships between variables in a study\n - Construct DAGs to identify potential confounders, mediators, and colliders and guide the selection of variables for adjustment in the analysis\n- Advanced study designs: Nested case-control studies, case-cohort studies, and randomized controlled trials with unique sampling and analysis considerations\n - Understand the strengths and limitations of each design and the 
appropriate measures of association and impact to use in each context","active":true,"order":10,"meta":{"title":"Epidemiological Measures | Intro to Biostatistics Class Notes","description":"Study guides to review Epidemiological Measures. For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"lpPmnlnjuaEDOqrx","type":"STUDY_GUIDE","title":"10.2 Relative risk","slug":"relative-risk","date":null,"keyTopics":[],"publicId":"lpPmnlnjuaEDOqrx","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["EDZkKiTVdqfeqxhh"],"duration":8},{"id":"OaxQ7C81fpnqbwej","type":"STUDY_GUIDE","title":"10.4 Sensitivity and specificity","slug":"sensitivity-specificity","date":null,"keyTopics":[],"publicId":"OaxQ7C81fpnqbwej","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["6cfq5fQM5dWWgByR"],"duration":10},{"id":"K0eQFC2DjGgjdIbx","type":"STUDY_GUIDE","title":"10.1 Incidence and prevalence","slug":"incidence-prevalence","date":null,"keyTopics":[],"publicId":"K0eQFC2DjGgjdIbx","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["W3seymb8cpA0MImW"],"duration":8},{"id":"4hpy9rn5TvyC9J8w","type":"STUDY_GUIDE","title":"10.3 Odds ratio","slug":"odds-ratio","date":null,"keyTopics":[],"publicId":"4hpy9rn5TvyC9J8w","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["JGKXDxo4i9UbKdO1"],"duration":8},{"id":"EsfutmLq0YnqH0xX","type":"STUDY_GUIDE","title":"10.5 Attributable 
risk","slug":"attributable-risk","date":null,"keyTopics":[],"publicId":"EsfutmLq0YnqH0xX","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["cu1kUDGl3BLhlCPY"],"duration":6}],"numResources":1},{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","description":"Unit 11: Statistical Software and Data Management","intro":"Statistical software and data management are crucial skills in biostatistics. These tools enable researchers to analyze complex datasets, perform statistical tests, and visualize results. Mastering software like R or SAS empowers biostatisticians to handle large datasets efficiently and conduct sophisticated analyses.\n\nData management involves organizing, cleaning, and preparing data for analysis. This process ensures data quality and consistency, which are essential for accurate results. Proper data management practices also facilitate collaboration, reproducibility, and compliance with ethical and legal standards in biomedical research.","overview":"## Key Statistical Concepts\n- Understand the difference between descriptive statistics, which summarize and describe the basic features of a dataset, and inferential statistics, which use sample data to make inferences about a larger population\n- Recognize the importance of measures of central tendency (mean, median, mode), which provide information about the typical or central value in a dataset\n- Differentiate between measures of dispersion (range, variance, standard deviation), which quantify the amount of variation or spread in a dataset\n- Identify the properties of the normal distribution, a symmetric, bell-shaped curve with a well-defined mean and standard deviation\n - Approximately 68% of data falls within one standard deviation of the mean, 95% within two standard deviations, and 99.7% within three standard deviations\n- Comprehend the concept of 
hypothesis testing, a statistical method used to make decisions or draw conclusions about a population based on sample data\n- Distinguish between null hypothesis (assumes no significant difference or effect) and alternative hypothesis (assumes a significant difference or effect)\n- Interpret p-values as the probability of obtaining the observed results or more extreme results, assuming the null hypothesis is true\n - A small p-value (typically < 0.05) suggests strong evidence against the null hypothesis, while a large p-value (> 0.05) indicates weak evidence against the null hypothesis\n\n## Introduction to Statistical Software\n- Familiarize yourself with popular statistical software packages (R, Python, SAS, SPSS, Stata) used for data analysis, visualization, and statistical modeling in biostatistics\n- Understand the benefits of using statistical software: it automates complex calculations, handles large datasets, and provides a wide range of built-in statistical functions and tests\n- Learn the basic syntax and commands of the chosen statistical software to perform data manipulation, analysis, and visualization tasks\n- Explore the integrated development environment (IDE) or graphical user interface (GUI) of the software to navigate through different features and options\n- Utilize built-in help documentation, online resources, and community forums to troubleshoot issues and learn advanced techniques in the statistical software\n- Practice importing and exporting various data file formats (CSV, Excel, JSON) to and from the statistical software\n- Discover the power of libraries or packages in extending the functionality of the statistical software by providing additional tools and methods for specific analysis tasks\n\n## Data Types and Structures\n- Understand the different data types (numeric, character, logical, factor) used to represent variables in statistical software\n - Numeric data represents quantitative values (integers or floating-point numbers)\n - 
Character data represents text or string values\n - Logical data represents binary values (TRUE or FALSE)\n - Factor data represents categorical variables with a fixed number of levels or categories\n- Recognize the importance of data structures (vectors, matrices, data frames) in organizing and manipulating data in statistical software\n - Vectors are one-dimensional arrays that hold elements of the same data type\n - Matrices are two-dimensional arrays with rows and columns, where all elements are of the same data type\n - Data frames are two-dimensional structures similar to matrices but can hold columns of different data types\n- Learn how to create, access, and modify elements within data structures using indexing and subsetting techniques\n- Understand the concept of missing values (NA, NaN) and how they are handled in different data structures and statistical analyses\n- Explore advanced data structures (lists, arrays, tibbles) that provide additional flexibility and functionality for complex data manipulation tasks\n- Practice reshaping data between wide and long formats using functions like `pivot_longer()` and `pivot_wider()` to facilitate different types of analyses\n\n## Data Cleaning and Preprocessing\n- Recognize the importance of data cleaning and preprocessing ensures data quality, consistency, and suitability for analysis\n- Identify and handle missing values through techniques like deletion, imputation, or interpolation based on the nature of the missing data and the analysis requirements\n- Detect and resolve inconsistencies, errors, and outliers in the dataset using summary statistics, visualization, or domain knowledge\n- Perform data type conversions (numeric to factor, character to date) to ensure variables are in the appropriate format for analysis\n- Apply data normalization or standardization techniques (z-score, min-max scaling) to bring variables to a common scale or distribution\n- Conduct data transformations (log, square root, 
Box-Cox) to improve the normality, linearity, or homoscedasticity of variables\n- Merge or join multiple datasets based on common variables or keys to create a unified dataset for analysis\n- Subset or filter data based on specific conditions or criteria to focus on relevant observations or variables\n\n## Descriptive Statistics and Visualization\n- Calculate and interpret measures of central tendency (mean, median, mode) to summarize the typical or central value in a dataset\n- Compute and interpret measures of dispersion (range, variance, standard deviation) to assess the variability or spread of the data\n- Generate frequency tables and contingency tables to summarize categorical variables and their relationships\n- Create informative visualizations (histograms, box plots, scatter plots, bar charts) to explore the distribution, relationships, and patterns in the data\n - Histograms display the distribution of a continuous variable using bins and frequencies\n - Box plots provide a summary of the five-number summary (minimum, first quartile, median, third quartile, maximum) and identify outliers\n - Scatter plots show the relationship between two continuous variables\n - Bar charts compare the frequencies or proportions of categorical variables\n- Customize visualizations by modifying plot elements (titles, labels, colors, scales) to enhance clarity and aesthetics\n- Apply data transformations or faceting techniques to create more informative and targeted visualizations\n- Interpret and communicate insights from descriptive statistics and visualizations to stakeholders or decision-makers\n\n## Basic Statistical Analyses\n- Perform hypothesis tests (t-tests, ANOVA, chi-square) to make inferences about population parameters based on sample data\n - T-tests compare means between two groups or against a known value\n - ANOVA (Analysis of Variance) compares means across multiple groups\n - Chi-square tests assess the association between categorical variables\n- 
Conduct correlation analysis to measure the strength and direction of the linear relationship between two continuous variables\n- Apply regression analysis (linear, logistic, Poisson) to model the relationship between a dependent variable and one or more independent variables\n - Linear regression models the relationship between a continuous dependent variable and one or more independent variables\n - Logistic regression models the probability of a binary outcome based on one or more independent variables\n - Poisson regression models the count of events based on one or more independent variables\n- Interpret the results of statistical analyses, including coefficients, p-values, confidence intervals, and goodness-of-fit measures\n- Assess the assumptions and diagnostics of statistical models to ensure the validity and reliability of the results\n- Apply appropriate post-hoc tests or corrections (Bonferroni, Tukey) to control for multiple comparisons or Type I error\n\n## Data Management Best Practices\n- Develop a clear and consistent naming convention for variables, files, and folders to ensure easy identification and organization\n- Use version control systems (Git, SVN) to track changes, collaborate with others, and maintain a history of the data and analysis files\n- Implement a structured and hierarchical folder organization system to store raw data, processed data, scripts, and output files separately\n- Document data sources, transformations, and analysis steps using README files, codebooks, or data dictionaries to ensure reproducibility and transparency\n- Regularly backup and store data in secure and reliable storage systems (cloud storage, external hard drives) to prevent data loss or corruption\n- Anonymize or de-identify sensitive or confidential data to protect privacy and comply with ethical and legal requirements\n- Validate and verify data integrity through checks for completeness, consistency, and accuracy\n- Establish data access and sharing 
protocols to control who can access, modify, or distribute the data based on roles and permissions\n\n## Practical Applications in Biostatistics\n- Epidemiological studies: Apply statistical methods to investigate the distribution, determinants, and control of health-related states or events in specified populations\n - Calculate measures of disease frequency (prevalence, incidence) and association (relative risk, odds ratio)\n - Conduct cohort studies or case-control studies to identify risk factors or protective factors for diseases\n- Clinical trials: Design and analyze experiments to evaluate the safety and efficacy of new medical interventions (drugs, devices, therapies)\n - Determine sample size and power calculations to ensure adequate statistical power\n - Perform randomization and blinding techniques to minimize bias and confounding\n - Analyze treatment effects using appropriate statistical tests and models\n- Survival analysis: Investigate the time until the occurrence of an event of interest (death, relapse, recovery) and factors influencing survival probabilities\n - Estimate survival functions using Kaplan-Meier curves or Cox proportional hazards models\n - Compare survival distributions between groups using log-rank tests or Cox regression\n- Diagnostic tests: Evaluate the performance and accuracy of diagnostic tests in detecting or ruling out a disease or condition\n - Calculate sensitivity, specificity, positive predictive value, and negative predictive value\n - Construct and interpret receiver operating characteristic (ROC) curves to assess the trade-off between sensitivity and specificity\n- Genomic data analysis: Apply statistical methods to analyze and interpret high-dimensional genomic data (gene expression, DNA methylation, single nucleotide polymorphisms)\n - Perform differential expression analysis to identify genes associated with a particular condition or treatment\n - Conduct pathway analysis or gene set enrichment analysis to identify 
biological processes or functions overrepresented in a gene list\n- Meta-analysis: Combine and synthesize results from multiple independent studies to obtain a more precise and comprehensive estimate of an effect or association\n - Assess heterogeneity between studies using statistical tests (Cochran's Q, I-squared)\n - Estimate pooled effect sizes using fixed-effect or random-effects models based on the assumption of heterogeneity","active":true,"order":11,"meta":{"title":"Statistical Software & Data Management | Intro to Biostatistics Class Notes","description":"Study guides to review Statistical Software & Data Management. For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"uuCg3qOSetbDAukb","type":"STUDY_GUIDE","title":"11.1 Introduction to statistical software packages","slug":"introduction-statistical-software-packages","date":null,"keyTopics":[],"publicId":"uuCg3qOSetbDAukb","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["cLVyPGrESNRSeIqJ"],"duration":8},{"id":"ECob8I1hDH2SslWt","type":"STUDY_GUIDE","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","date":null,"keyTopics":[],"publicId":"ECob8I1hDH2SslWt","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["kU83WaOxOyIAfXYs"],"duration":8},{"id":"fltCwHwrmxFkPzLD","type":"STUDY_GUIDE","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","date":null,"keyTopics":[],"publicId":"fltCwHwrmxFkPzLD","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["PspxC0zWdY3Ndq9l"],"duration":8},{"id":"UdImkGkjLzRBsQ7s","type":"STUDY_GUIDE","title":"11.2 Data cleaning and 
preprocessing","slug":"data-cleaning-preprocessing","date":null,"keyTopics":[],"publicId":"UdImkGkjLzRBsQ7s","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["bymkizpzgeeLAQqX"],"duration":11},{"id":"X9OMg6UVm28mXVLC","type":"STUDY_GUIDE","title":"11.3 Data visualization tools","slug":"data-visualization-tools","date":null,"keyTopics":[],"publicId":"X9OMg6UVm28mXVLC","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["oSTjCwIqnx2RvxdE"],"duration":12}],"numResources":1}],"exams":[]},"unit":{"id":"Gu8eBjmCYzyZOqH8","name":"Unit 11 – Statistical Software & Data Management","emoji":"📚","slug":"unit-11","description":"Unit 11: Statistical Software and Data Management","intro":"Statistical software and data management are crucial skills in biostatistics. These tools enable researchers to analyze complex datasets, perform statistical tests, and visualize results. Mastering software like R or SAS empowers biostatisticians to handle large datasets efficiently and conduct sophisticated analyses.\n\nData management involves organizing, cleaning, and preparing data for analysis. This process ensures data quality and consistency, which are essential for accurate results. 
Proper data management practices also facilitate collaboration, reproducibility, and compliance with ethical and legal standards in biomedical research.","overview":"## Key Statistical Concepts\n- Understand the difference between descriptive statistics summarizes and describes the basic features of a dataset and inferential statistics uses sample data to make inferences about a larger population\n- Recognize the importance of measures of central tendency (mean, median, mode) provides information about the typical or central value in a dataset\n- Differentiate between measures of dispersion (range, variance, standard deviation) quantifies the amount of variation or spread in a dataset\n- Identify the properties of normal distribution a symmetric, bell-shaped curve with a well-defined mean and standard deviation\n - Approximately 68% of data falls within one standard deviation of the mean, 95% within two standard deviations, and 99.7% within three standard deviations\n- Comprehend the concept of hypothesis testing a statistical method used to make decisions or draw conclusions about a population based on sample data\n- Distinguish between null hypothesis (assumes no significant difference or effect) and alternative hypothesis (assumes a significant difference or effect)\n- Interpret p-values the probability of obtaining the observed results or more extreme results, assuming the null hypothesis is true\n - A small p-value (typically < 0.05) suggests strong evidence against the null hypothesis, while a large p-value (> 0.05) indicates weak evidence against the null hypothesis\n\n## Introduction to Statistical Software\n- Familiarize yourself with popular statistical software packages (R, Python, SAS, SPSS, Stata) used for data analysis, visualization, and statistical modeling in biostatistics\n- Understand the benefits of using statistical software automates complex calculations, handles large datasets, and provides a wide range of built-in statistical functions and 
tests\n- Learn the basic syntax and commands of the chosen statistical software to perform data manipulation, analysis, and visualization tasks\n- Explore the integrated development environment (IDE) or graphical user interface (GUI) of the software to navigate through different features and options\n- Utilize built-in help documentation, online resources, and community forums to troubleshoot issues and learn advanced techniques in the statistical software\n- Practice importing and exporting various data file formats (CSV, Excel, JSON) to and from the statistical software\n- Discover the power of libraries or packages in extending the functionality of the statistical software by providing additional tools and methods for specific analysis tasks\n\n## Data Types and Structures\n- Understand the different data types (numeric, character, logical, factor) used to represent variables in statistical software\n - Numeric data represents quantitative values (integers or floating-point numbers)\n - Character data represents text or string values\n - Logical data represents binary values (TRUE or FALSE)\n - Factor data represents categorical variables with a fixed number of levels or categories\n- Recognize the importance of data structures (vectors, matrices, data frames) in organizing and manipulating data in statistical software\n - Vectors are one-dimensional arrays that hold elements of the same data type\n - Matrices are two-dimensional arrays with rows and columns, where all elements are of the same data type\n - Data frames are two-dimensional structures similar to matrices but can hold columns of different data types\n- Learn how to create, access, and modify elements within data structures using indexing and subsetting techniques\n- Understand the concept of missing values (NA, NaN) and how they are handled in different data structures and statistical analyses\n- Explore advanced data structures (lists, arrays, tibbles) that provide additional flexibility and 
functionality for complex data manipulation tasks\n- Practice reshaping data between wide and long formats using functions like `pivot_longer()` and `pivot_wider()` to facilitate different types of analyses\n\n## Data Cleaning and Preprocessing\n- Recognize the importance of data cleaning and preprocessing ensures data quality, consistency, and suitability for analysis\n- Identify and handle missing values through techniques like deletion, imputation, or interpolation based on the nature of the missing data and the analysis requirements\n- Detect and resolve inconsistencies, errors, and outliers in the dataset using summary statistics, visualization, or domain knowledge\n- Perform data type conversions (numeric to factor, character to date) to ensure variables are in the appropriate format for analysis\n- Apply data normalization or standardization techniques (z-score, min-max scaling) to bring variables to a common scale or distribution\n- Conduct data transformations (log, square root, Box-Cox) to improve the normality, linearity, or homoscedasticity of variables\n- Merge or join multiple datasets based on common variables or keys to create a unified dataset for analysis\n- Subset or filter data based on specific conditions or criteria to focus on relevant observations or variables\n\n## Descriptive Statistics and Visualization\n- Calculate and interpret measures of central tendency (mean, median, mode) to summarize the typical or central value in a dataset\n- Compute and interpret measures of dispersion (range, variance, standard deviation) to assess the variability or spread of the data\n- Generate frequency tables and contingency tables to summarize categorical variables and their relationships\n- Create informative visualizations (histograms, box plots, scatter plots, bar charts) to explore the distribution, relationships, and patterns in the data\n - Histograms display the distribution of a continuous variable using bins and frequencies\n - Box plots provide 
a summary of the five-number summary (minimum, first quartile, median, third quartile, maximum) and identify outliers\n - Scatter plots show the relationship between two continuous variables\n - Bar charts compare the frequencies or proportions of categorical variables\n- Customize visualizations by modifying plot elements (titles, labels, colors, scales) to enhance clarity and aesthetics\n- Apply data transformations or faceting techniques to create more informative and targeted visualizations\n- Interpret and communicate insights from descriptive statistics and visualizations to stakeholders or decision-makers\n\n## Basic Statistical Analyses\n- Perform hypothesis tests (t-tests, ANOVA, chi-square) to make inferences about population parameters based on sample data\n - T-tests compare means between two groups or against a known value\n - ANOVA (Analysis of Variance) compares means across multiple groups\n - Chi-square tests assess the association between categorical variables\n- Conduct correlation analysis to measure the strength and direction of the linear relationship between two continuous variables\n- Apply regression analysis (linear, logistic, Poisson) to model the relationship between a dependent variable and one or more independent variables\n - Linear regression models the relationship between a continuous dependent variable and one or more independent variables\n - Logistic regression models the probability of a binary outcome based on one or more independent variables\n - Poisson regression models the count of events based on one or more independent variables\n- Interpret the results of statistical analyses, including coefficients, p-values, confidence intervals, and goodness-of-fit measures\n- Assess the assumptions and diagnostics of statistical models to ensure the validity and reliability of the results\n- Apply appropriate post-hoc tests or corrections (Bonferroni, Tukey) to control for multiple comparisons or Type I error\n\n## Data Management 
Best Practices\n- Develop a clear and consistent naming convention for variables, files, and folders to ensure easy identification and organization\n- Use version control systems (Git, SVN) to track changes, collaborate with others, and maintain a history of the data and analysis files\n- Implement a structured and hierarchical folder organization system to store raw data, processed data, scripts, and output files separately\n- Document data sources, transformations, and analysis steps using README files, codebooks, or data dictionaries to ensure reproducibility and transparency\n- Regularly backup and store data in secure and reliable storage systems (cloud storage, external hard drives) to prevent data loss or corruption\n- Anonymize or de-identify sensitive or confidential data to protect privacy and comply with ethical and legal requirements\n- Validate and verify data integrity through checks for completeness, consistency, and accuracy\n- Establish data access and sharing protocols to control who can access, modify, or distribute the data based on roles and permissions\n\n## Practical Applications in Biostatistics\n- Epidemiological studies: Apply statistical methods to investigate the distribution, determinants, and control of health-related states or events in specified populations\n - Calculate measures of disease frequency (prevalence, incidence) and association (relative risk, odds ratio)\n - Conduct cohort studies or case-control studies to identify risk factors or protective factors for diseases\n- Clinical trials: Design and analyze experiments to evaluate the safety and efficacy of new medical interventions (drugs, devices, therapies)\n - Determine sample size and power calculations to ensure adequate statistical power\n - Perform randomization and blinding techniques to minimize bias and confounding\n - Analyze treatment effects using appropriate statistical tests and models\n- Survival analysis: Investigate the time until the occurrence of an event 
of interest (death, relapse, recovery) and factors influencing survival probabilities\n - Estimate survival functions using Kaplan-Meier curves or Cox proportional hazards models\n - Compare survival distributions between groups using log-rank tests or Cox regression\n- Diagnostic tests: Evaluate the performance and accuracy of diagnostic tests in detecting or ruling out a disease or condition\n - Calculate sensitivity, specificity, positive predictive value, and negative predictive value\n - Construct and interpret receiver operating characteristic (ROC) curves to assess the trade-off between sensitivity and specificity\n- Genomic data analysis: Apply statistical methods to analyze and interpret high-dimensional genomic data (gene expression, DNA methylation, single nucleotide polymorphisms)\n - Perform differential expression analysis to identify genes associated with a particular condition or treatment\n - Conduct pathway analysis or gene set enrichment analysis to identify biological processes or functions overrepresented in a gene list\n- Meta-analysis: Combine and synthesize results from multiple independent studies to obtain a more precise and comprehensive estimate of an effect or association\n - Assess heterogeneity between studies using statistical tests (Cochran's Q, I-squared)\n - Estimate pooled effect sizes using fixed-effect or random-effects models based on the assumption of heterogeneity","active":true,"order":11,"meta":{"title":"Statistical Software & Data Management | Intro to Biostatistics Class Notes","description":"Study guides to review Statistical Software & Data Management. 
For college students taking Intro to Biostatistics."},"metaDesc":null,"resources":[{"id":"uuCg3qOSetbDAukb","type":"STUDY_GUIDE","title":"11.1 Introduction to statistical software packages","slug":"introduction-statistical-software-packages","date":null,"keyTopics":[],"publicId":"uuCg3qOSetbDAukb","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["cLVyPGrESNRSeIqJ"],"duration":8},{"id":"ECob8I1hDH2SslWt","type":"STUDY_GUIDE","title":"11.4 Basic programming concepts","slug":"basic-programming-concepts","date":null,"keyTopics":[],"publicId":"ECob8I1hDH2SslWt","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["kU83WaOxOyIAfXYs"],"duration":8},{"id":"fltCwHwrmxFkPzLD","type":"STUDY_GUIDE","title":"11.5 Reproducible research practices","slug":"reproducible-research-practices","date":null,"keyTopics":[],"publicId":"fltCwHwrmxFkPzLD","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["PspxC0zWdY3Ndq9l"],"duration":8},{"id":"UdImkGkjLzRBsQ7s","type":"STUDY_GUIDE","title":"11.2 Data cleaning and preprocessing","slug":"data-cleaning-preprocessing","date":null,"keyTopics":[],"publicId":"UdImkGkjLzRBsQ7s","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["bymkizpzgeeLAQqX"],"duration":11},{"id":"X9OMg6UVm28mXVLC","type":"STUDY_GUIDE","title":"11.3 Data visualization 
tools","slug":"data-visualization-tools","date":null,"keyTopics":[],"publicId":"X9OMg6UVm28mXVLC","vimeoLiveLink":null,"url":null,"eventTitle":null,"resources":[],"subject":{"slug":"introduction-to-biostatistics"},"streamers":[],"creators":[],"topicIds":["oSTjCwIqnx2RvxdE"],"duration":12}],"numResources":1}}]}]]