% this next line updated by emacs every time the file is saved:
\def\tsval{Time-stamp: < 13 May 2017 23:15MDT >}
%
%
% Lies, Damned Lies, or Statistics: How to Tell the Truth with Statistics
% by Jonathan A. Poritz
% jonathan@poritz.net and http://www.poritz.net/jonathan
% This work is licensed under the Creative Commons Attribution-
% ShareAlike 4.0 International License. To view a copy
% of this license, visit
% http://creativecommons.org/licenses/by-sa/4.0/.
%
% You are free to share and adapt this book for any purpose, even
% commercially (that's what the CC license is about) but:
% -- you must give appropriate credit to this author and to
% -- you must indicate in your version if you have made changes to our
% originals, and
% -- if you remix, transform, or build upon the material, you must
% distribute your contributions under the same license as the
% original
% -- you may not apply legal terms or technological measures which
% restrict others from doing anything the license permits.
%
% While this is not a legal requirement, I would be happy to know if you are
% using and/or adapting this work: please send such information to my
% e-mail address jonathan.poritz@gmail.com. For that matter, if you
% find typos or have suggestions of things to change or add, I would
% be happy to hear that as well.
%
%
%
% --------------------------------------------------------------------------
% Some notes on using this TeX file:
%
% It is standard LaTeX -- my version says
% 'This is pdfTeX, Version 3.1415926-2.4-1.40.13 (TeX Live 2012/Debian)'
% at the start of the run. It does use quite a few packages, but they are
% all available from the standard 'net repositories ... in fact, I installed
% nothing particularly for this book.
%
% Here is how I produce the final ldlos.pdf file on my Linux box
% (on other systems, you will know how that must be changed)
% latex ldlos
% bibtex ldlos
% latex ldlos
% latex ldlos
% makeindex ldlos
% latex ldlos
% dvips -o ldlos.ps ldlos
% ps2pdf ldlos.ps
% view ldlos.pdf in my favorite viewer (I use 'evince')
%
% Note you will need the following files in the directory where you do this:
% ldlos.tex
% refs.bib
% ldlos_cover.eps
% by-sa.eps
% flipsfreq.eps
% flipsrelfreq.eps
% flipsrelfreqcheat.eps
% flipspiechart.eps
% scoreshistbytens.eps
% scoreshistbytwos.eps
% scoreshistbyfives.eps
% scoreshistbytwenties.eps
% scoreshistbyfifties.eps
% scoresRFhistbytens.eps
% boxplot1.eps
% boxplot2.eps
% boxplot_exc.eps
% scatterrange.eps
% residual.eps
% scatter1.eps
% scatter2.eps
% scatter3.eps
% VennSampSpace.eps
% VennSampSpace1Circ.eps
% VennSampSpace1CircWDots.eps
% VennSampSpace1FilledCir.eps
% VennSamSpComp1FilledCir.eps
% VennSamSp2CirDisj.eps
% VnSamSp2CirNotDsj.eps
% VnSmSp2CrNtDsjDts.eps
% VnSmSp1FilledCirL.eps
% VnSmSp1FilledCirR.eps
% VnRacoon.eps
% VnSmSpFilledOvlp.eps
% VennSampSpace1Circ.4.eps
% VennSampSpace1Circ.4.6.eps
% VnSamSp2CirNotDsj.eps
% egVnDiagWProbs.eps
% probDistFunct.eps
% busWaitingTimes.eps
% dartHittingDistance.eps
% dartHittingBsE.eps
% dartMissingBsE.eps
% Norm17-3.eps
% Norm17-1.eps
% Norm17-5.eps
% Norm17-1_3_5.eps
% Norm0-1.eps
% 68_95_99.7.eps
% use68_95_99.7-1pic.eps
%
% You should be able to customize this, or extract parts from it for your own
% use fairly easily. I tried to follow good, clear, TeX style: for example,
% labels, for references to definitions, theorems, sections, etc., all
% follow a fairly uniform naming convention which should be obvious just by
% looking at a few.
%
%
%
\documentclass[12pt,letterpaper]{amsbook}
\usepackage[text={6in,9in},centering]{geometry}
\usepackage{wrapfig}
\usepackage{times}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathrsfs}
\usepackage{mathtools}
\usepackage{amsfonts}
\usepackage{hyperref}
\usepackage[hyphenbreaks]{breakurl}
\usepackage{graphicx}
\usepackage{float}
\usepackage{multirow}
\usepackage{afterpage}
\usepackage{environ}
\usepackage{xcolor}
\usepackage{pagecolor}
% Theorem-like environments (amsthm).  Environments declared before the first
% \theoremstyle command use amsthm's default ``plain'' style.  ``theorem'' is
% numbered within sections; every environment declared with [theorem] shares
% its counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{examples}[theorem]{Examples}
% ``definition'' style: definitions (sharing the theorem counter) and
% exercises (which have a counter of their own).
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{exercise}{Exercise}
% ``remark'' style: remarks, single examples, and facts, all sharing the
% theorem counter.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{example}[theorem]{Example}
\newtheorem{fact}[theorem]{Fact}
% Counter nesting: figures and equations are numbered within sections,
% exercises and sections within chapters, and tables within subsections.
\numberwithin{figure}{section}
\numberwithin{exercise}{chapter}
\numberwithin{section}{chapter}
\numberwithin{equation}{section}
\numberwithin{table}{subsection}
%macros for dealing with the emacs timestamp
% \tsval (defined on the second line of this file) holds a string of the form
%   Time-stamp: < 13 May 2017 23:15MDT >
% \tsgutsb expands its argument one level (so that \tsval becomes that string)
% and hands the result, terminated by \relax, to \tsgutsa.
\def\tsgutsb#1{\expandafter\tsgutsa #1\relax}
% \tsgutsa splits the string at spaces into seven pieces and keeps only
% pieces 3-6 (day, month, year, time), dropping the ``Time-stamp:'' prefix
% and the angle-bracket markers.
\def\tsgutsa#1 #2 #3 #4 #5 #6 #7\relax{#3 #4 #5 #6}
% \timestamp typesets the date and time extracted from \tsval.
\def\timestamp{\tsgutsb{\tsval}}
% Empty wrapper environments: they add no formatting of their own and only
% mark up the corresponding parts of the source.
\newenvironment{preface}{}{}
\newenvironment{releasenotes}{}{}
\newenvironment{discussion}{}{}
% Math shorthands: a calligraphic P, and mu / sigma carrying a subscript X or
% \overline{x}; the \negthinspace pulls the subscript slightly to the left.
\newcommand{\Pp}{{\mathcal P}}
\newcommand{\muX}{{\mu_{\negthinspace X}}}
\newcommand{\muolX}{{\mu_{\negthinspace\overline{x}}}}
\newcommand{\sigmaX}{{\sigma_{\negthinspace X}}}
\newcommand{\sigmaolX}{{\sigma_{\negthinspace\overline{x}}}}
% Redefine \seename to ``see''.
% NOTE(review): presumably the word printed in index cross-references -- confirm.
\renewcommand{\seename}{see}
% Stretch the line spacing by a factor of 1.2.
\linespread{1.2}
% Enable writing of the raw index file, which the makeindex step listed in the
% build notes at the top of this file then processes.
\makeindex
\begin{document}
\frontmatter
\title{\ \vskip-3cm\includegraphics[width=10cm,clip]{ldlos_cover.eps}\\\huge\color{red} Lies, Damned Lies, or Statistics:\\
{\LARGE\it How to Tell the Truth with Statistics}}
\author{{\color{orange}{\Large\bf Jonathan A.~Poritz}}}
\address{\color{orange}\centerline{\bf Department of Mathematics and Physics\hphantom{XXXXX}}
\centerline{\bf Colorado State University, Pueblo}
\centerline{\bf 2200 Bonforte Blvd.}
\centerline{\bf Pueblo, CO 81001, USA}
\centerline{{\bf E-mail: jonathan@poritz.net}}
\centerline{{\bf Web: poritz.net/jonathan}}
\vskip2.5cm\hfill\tiny\timestamp}
\maketitle
\restorepagecolor
\chapter*{Release Notes}
This is a first draft of a free (as in speech, not as in beer,
\cite{stallman2002free}) (although it is free as in beer as well) textbook
for a one-semester, undergraduate statistics course. It was used for Math 156
at Colorado State University--Pueblo in the spring semester of 2017.
Thanks are hereby offered to the students in that class who offered
many useful suggestions and found numerous typos. In particular, Julie
Berogan has an eagle eye, and found a nearly uncountably infinite number of
mistakes, both small and large -- thank you!
\begin{wrapfigure}[3]{r}{0.15\textwidth}
\begin{center}
\includegraphics[height=.7cm,clip]{by-sa.eps}
\end{center}
\end{wrapfigure}
This work is released under a {\bf CC BY-SA 4.0} license, which allows anyone
who is interested to \textbf{share} (copy and redistribute in any medium or
format) and \textbf{adapt} (remix, transform, and build upon this work for any
purpose, even commercially). These rights cannot be revoked, so long as users
follow the license terms, which require \textbf{attribution} (giving
appropriate credit, linking to the license, and indicating if changes were
made) to be given and \textbf{share-alike} (if you remix or transform this
work, you must distribute your contributions under the same license as this
one) imposed. See {\tt creativecommons.org/licenses/by-sa/4.0} for all the
details.
This version: \timestamp.
\vskip2mm
\hfill\begin{tabular}{l}Jonathan A.~Poritz\\
Spring Semester, 2017\\
Pueblo, CO, USA
\end{tabular}
% Include sectioning levels down to level 3 in the table of contents.
\setcounter{tocdepth}{3}
% Override the table-of-contents line format for subsections; the @ in the
% macro names is why \makeatletter ... \makeatother is needed.
% NOTE(review): the arguments of \@tocline are presumably level, extra
% vertical space, indent, number width, and font -- confirm against amsbook.cls.
\makeatletter
\def\l@subsection{\@tocline{2}{0pt}{2.5pc}{5pc}{}}
\makeatother
\tableofcontents
\begin{preface}
\chapter*{Preface}
Mark
Twain's\index{Twain, Mark [Samuel Clemens]}\index{Clemens, Samuel [Mark Twain]}
autobiography \cite{twain2010autobiography} modestly questions his own
reporting of the numbers of hours per day he sat down to write, and of the
number of words he wrote in that time, saying
\begin{quote}
\textit{Figures often beguile me, particularly when I have the arranging of
them myself; in which case the remark attributed to Disraeli would often apply
with justice and force:
\begin{center}
``\textbf{There are three kinds of lies: lies, damned lies, and
statistics.}''\index{lies}\index{lies, damned}
\end{center}}
\end{quote}
[emphasis added]
Here Twain gives credit for this pithy tripartite classification of lies to
Benjamin Disraeli\index{Disraeli, Benjamin}, who was Prime Minister of the
United Kingdom in 1868 (under Queen Victoria), although modern scholars find
no evidence that Disraeli was the actual originator of the phrase. But
whoever actually deserves credit for the phrase, it does seem that statistics
are often used to conceal the truth, rather than to reveal it. So much so,
for example, that the wonderful book {\bf How to Lie with
Statistics}\index{How to Lie with Statistics} \cite{huff1993how}, by Darrell
Huff\index{Huff, Darrell}, gives many, many examples of misused statistics,
and yet merely scratches the surface.
We contend, however, that statistics are not a type of lie, but rather, when
used carefully, are an {\it alternative} to lying. For this reason, we use
``or'' in the title of this book, where Twain/Disraeli used ``and,'' to
underline how we are thinking of statistics, correctly applied, as standing
in opposition to lies and damned lies.
But why use such a complicated method of telling the truth as statistics, rather
than, say, telling a good story or painting a moving picture? The answer, we
believe, is simply that there are many concrete, specific questions that humans
have about the world which are best answered by carefully collecting some data
and using a modest amount of mathematics and a fair bit of logic to analyze
them. The thing about the Scientific Method is that it just seems to work. So
why not learn how to use it?
Learning better techniques of critical thinking seems particularly important
at this moment of history when our politics in the United States (and
elsewhere) are so divisive, and different parties cannot agree about the most
basic facts. A lot of commentators from all parts of the political spectrum
have speculated about the impact of so-called
{\it fake news}\index{fake news}\index{news, fake} on the outcomes of recent
elections and other political debates. It is therefore the goal of
this book to help you learn {\bf How to Tell the Truth with Statistics} and,
therefore, how to tell when others are telling the truth ... or are faking
their ``news.''
\end{preface}
\mainmatter
\part{Descriptive Statistics}\label{part:DS}
The first instinct of the scientist should be to organize carefully a
question of interest, and to collect some data about this question. How to
collect good data is a real and important issue, but one we discuss later.
Let us instead assume for the moment that we have some data, good or bad, and
first consider what to do with them\footnote{The word ``data'' is really a
plural, corresponding to the singular ``datum.'' We will try to remember to
use plural forms when we talk about ``data,'' but there will be no penalty
for (purely grammatical) failure to do so.}. In particular, we want to
describe them, both graphically and with numbers that summarize some of their
features.
We will start by making some basic definitions of terminology -- words like
{\bf individual}, {\bf population}, {\bf variable}, {\bf mean}, {\bf median},
{\it etc.} -- which it will be important for the student to understand
carefully and completely. So let's briefly discuss what a definition
{\it is}, in mathematics.
Mathematical definitions\index{definition, in mathematics} should be perfectly
precise because they do not {\it describe} something which is observed out
there in the world, since such descriptive definitions might have fuzzy
edges. In biology, for example, whether a virus is considered ``alive''
could be subject to some debate: viruses have some of the characteristics of
life, but not others. This makes a mathematician nervous.
When we look at math, however, we should always know exactly which objects
satisfy some definition and which do not. For example, an {\it even
number}\index{even number, definition}
is a whole number which is two times some other whole number. We can always
tell whether some number $n$ is even, then, by simply checking if there is
some other number $k$ for which the arithmetic statement $n=2k$ is true: if
so, $n$ is even, if not, $n$ is not even. If you claim a number $n$ is even,
you need just state what is the corresponding $k$; if you claim it is not even,
you have to somehow give a convincing, detailed explanation (dare we call it
a ``proof''\index{proof}) that such a $k$ simply does not exist.
So it is important to learn mathematical definitions carefully, to know what
the criteria are for a definition, to know examples that satisfy some
definition and other examples which do not.
Note, finally, that in statistics, since we are using mathematics in the real
world, there will be some terms (like {\bf individual} and {\bf population})
which will not be exclusively in the mathematical realm and will therefore
have less perfectly mathematical definitions. Nevertheless, students should
try to be as clear and precise as possible.
\vskip5mm
The material in this Part is naturally broken into two cases, depending upon
whether we measure a single thing about a collection of individuals or we make
several measurements. The first case is called {\bf one-variable
statistics}\index{one-variable statistics}, and will be our first
major topic. The second case could potentially go as far as
{\bf multi-variable statistics}\index{multi-variable statistics}, but we will
mostly talk about situations where we make {\it two} measurements, our second
major topic. In this case of {\bf bivariate
statistics}\index{bivariate statistics}, we will not only describe each
variable separately (both graphically and numerically), but we will also
describe their relationship, graphically and numerically as well.
\chapter{One-Variable Statistics: Basics}\label{chap:1VS}
\section{Terminology: Individuals/Population/Variables/Samples}
\label{sec:TIPVS}
Oddly enough, it is often a lack of clarity about {\it who} [or {\it what}]
{\it you are looking at} which makes a lie out of statistics. Here are the
terms, then, to keep straight:
\begin{definition}\index{individual in a statistical study}\index{population of a statistical study}
The units which are the objects of a statistical study are called the
{\bf individuals}\index{individual in a statistical study} in that study,
while the collection of all such individuals is called the
{\bf population}\index{population of a statistical study} of the study.
\end{definition}
Note that while the term ``individuals'' sounds like it is talking about
people, the individuals in a study could be things, even abstract things like
events.
\begin{example}\index{individual in a statistical study}\index{population of a statistical study}
\label{eg:votersind}
The individuals in a study about a democratic election might be
{\it the voters}\index{voters}. But if you are going to make an accurate
prediction of who will win the election, it is important to be more precise
about what exactly is the population of all of those individuals [voters]
that you intend to study, be it {\it all eligible voters}, {\it all
registered voters}, {\it the people who actually voted}, {\it etc.}
\end{example}
\begin{example}\index{individual in a statistical study}\index{population of a statistical study}
\label{eg:flipsind}
If you want to study if a coin is ``fair'' or not, you would flip it repeatedly.
The individuals would then be {\it flips of that coin}, and the population
might be something like {\it all the flips ever done in the past and all that
will ever be done in the future}. These individuals are quite abstract, and
in fact it is impossible ever to get your hands on all of them (the ones in the
future, for example).
\end{example}
\begin{example}\index{individual in a statistical study}\index{population of a statistical study}
\label{eg:studentstakingtestsind}
Suppose we're interested in studying whether doing more homework helps students
do better in their studies. So shouldn't the individuals be the students?
Well, which students? How about we look only at college students. Which
college students? OK, how about students at 4-year colleges and universities
in the United States, over the last five years -- after all, things might be
different in other countries and other historical periods.
Wait, a particular student might sometimes do a lot of homework and sometimes
do very little. And what exactly does ``do better in their studies'' mean? So
maybe we should look at each student in each class they take, then we can look
at the homework they did for that class and the success they had in it.
Therefore, the individuals in this study would be {\it individual experiences
that students in US 4-year colleges and universities had in the last five
years}, and the population of the study would essentially be the collection of all
the names on all class rosters of courses in the last five years at all US
4-year colleges and universities.
\end{example}
When doing an actual scientific study, we are usually not interested so much
in the individuals themselves, but rather in
\begin{definition}\index{variable}
A {\bf variable} in a statistical study is the answer to a question the
researcher is asking about each individual. There are two types:
\begin{itemize}
\item A {\bf categorical variable}\index{variable!categorical}\index{categorical variable}
is one whose
values have a finite number of possibilities.
\item A {\bf quantitative variable}\index{quantitative variable}\index{variable!quantitative}\index{quantitative variable} is one whose
values are numbers (so, potentially an infinite number of possibilities).
\end{itemize}
\end{definition}
The variable is something which (as the name says) {\it varies}, in the sense
that it can have a different value for each individual in the population
(although that is not necessary).
\begin{example}\index{variable!categorical}\index{categorical variable}
\label{eg:votersvar}
In Example~\ref{eg:votersind}, the variable most likely would be {\it who they
voted for}, a categorical variable with only possible values ``Mickey Mouse''
or ``Daffy Duck'' (or whoever the names on the ballot were).
\end{example}
\begin{example}\index{variable!categorical}\index{categorical variable}
\label{eg:flipsvar}
In Example~\ref{eg:flipsind}, the variable most likely would be {\it what face
of the coin was facing up after the flip}, a categorical variable with values
``heads'' and ``tails.''
\end{example}
\begin{example}\label{eg:studentstakingtestsvar}
There are several variables we might use in
Example~\ref{eg:studentstakingtestsind}. One might be {\it how many homework
problems did the student do in that course}. Another could be {\it how many
hours total did the student spend doing homework over that whole semester, for
that course}. Both of those would be quantitative
variables\index{quantitative variable}\index{variable!quantitative}\index{quantitative variable}.
A categorical variable\index{variable!categorical}\index{categorical variable} for the same
population would be {\it what letter grade did the student get in the course},
which has possible values {\bf A}, {\bf A-}, {\bf B+}, \dots, {\bf D-}, {\bf F}.
\end{example}
In many [most?] interesting studies, the population is too large for it to be
practical to go observe the values of some interesting variable. Sometimes it
is not just impractical, but actually impossible -- think of the example we gave
of all the flips of the coin, even the ones in the future. So instead, we
often work with
\begin{definition}\index{sample}
A {\bf sample} is a subset of a population under study.
\end{definition}
Often we use the variable
$N$\index{population size, $N$}\index{10760@$N$, population size} to indicate the
size of a whole population and the variable
$n$\index{sample size, $n$}\index{10770@$n$, sample size} for the size of a sample;
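as we have said, usually $n<N$.
% NOTE(review): The original text between ``usually $n<N$'' and the gnuplot
% comments below was lost in extraction (everything between a less-than sign
% and the next greater-than sign was stripped).  What follows is a minimal
% reconstruction based on the labels, file names, and data referenced
% elsewhere in this file; the wording, the section label, and the counts of
% 4 heads and 6 tails are inferred.  Restore the author's original text if it
% is available.
\section[Visual Representation of Data, I]{Visual Representation of Data, I: Categorical Variables}
\label{sec:VRoDICV}
\subsection{Bar Charts I: Frequency Charts}\label{ssec:BCIFC}
We can take the values which a categorical variable takes on a sample and
make a bar chart in which the height of each bar is the number of times that
value occurred in the sample -- the {\bf frequency} of that value.
\begin{example}\label{eg:flipsbarchartfreq}\index{bar chart}
Suppose we flip a coin $10$ times and see $4$ heads [H] and $6$ tails [T].
The frequency bar chart of these data would look like
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{flipsfreq.eps}
\end{center}
% this chart was presumably made with datafile ``flipsfreq.data'' containing
% the 2 lines
% 0 H 4
% 1 T 6
%then gnuplot was run with commands:
% gnuplot> set xlabel "Face showing on coin"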
% gnuplot> set ylabel "Frequency"
% gnuplot> unset key
% gnuplot> set yrange [0:10]
% gnuplot> set style fill solid
% gnuplot> set boxwidth .5
% gnuplot> plot "flipsfreq.data" using 1:3:xtic(2) with boxes
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as ``flipsfreq.eps''
\end{example}
\subsection{Bar Charts II: Relative Frequency Charts}\label{ssec:BCIIRFC}
There is a variant of the above kind of bar chart which actually looks nearly
the same but changes the labels on the $y$-axis. That is, instead of making
the height of each bar be how many times each categorical value occurred, we
could make it be {\it what fraction of the sample had that categorical
value} -- the {\bf relative frequency}\label{def:relfreq}\index{relative frequency}\index{frequency!relative}. This fraction is often displayed as a
percentage.
\begin{example}\label{eg:flipsbarchartrelfreq}
\index{bar chart!relative frequency}
The relative frequency version of the above bar chart in
Example~\ref{eg:flipsbarchartfreq} would look like
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{flipsrelfreq.eps}
\end{center}
% this chart was made with datafile ``flipsrelfreq.data'' containing the 2 lines
% 0 H .4
% 1 T .6
%then gnuplot was run with commands:
% gnuplot> set xlabel "Face showing on coin"
% gnuplot> set ylabel "Relative frequency"
% gnuplot> unset key
% gnuplot> set yrange [0:1]
% gnuplot> set style fill solid
% gnuplot> set boxwidth .5
% gnuplot> plot "flipsrelfreq.data" using 1:3:xtic(2) with boxes
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as ``flipsrelfreq.eps''
\end{example}
\subsection{Bar Charts III: Cautions}\label{ssec:BCIIIC}
Notice that with bar charts (of either frequency or relative frequency) the
variable values along the $x$-axis {\it can appear in any order whatsoever}.
This means that any conclusion you draw from looking at the bar chart must
not depend upon that order. For example, it would be foolish to say that the
graph in the above Example~\ref{eg:flipsbarchartfreq} ``shows an increasing
trend,'' since it would make just as much sense to put the bars in the other
order and then ``show a decreasing trend'' -- both are meaningless.
For relative frequency bar charts, in particular, note that the total of
the heights of all the bars must be $1$ (or $100$\%). If it is more, something
is weird; if it is less, some data has been lost.
Finally, it makes sense for both kinds of bar charts for the $y$-axis to
run from the logical minimum to maximum. For frequency charts, this means it
should go from $0$ to $n$ (the sample size). For relative frequency charts,
it should go from $0$ to $1$ (or $100$\%). Skimping on how much of this
appropriate $y$-axis is used is a common trick to lie with statistics.
\begin{example}\label{eg:flipsbarchartrelfreqbadyaxis}
\index{bar chart!relative frequency}
The coin we looked at in Example~\ref{eg:flipsbarchartfreq} and
Example~\ref{eg:flipsbarchartrelfreq} could well be a fair coin -- it didn't
show exactly half heads and half tails, but it was pretty close. Someone who
was trying to claim, deceptively, that the coin was not fair might have shown
only a portion of the $y$-axis, as
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{flipsrelfreqcheat.eps}
\end{center}
% this chart was made with datafile ``flipsrelfreq.data'' containing the 2 lines
% 0 H .4
% 1 T .6
%then gnuplot was run with commands:
% gnuplot> set xlabel "Face showing on coin"
% gnuplot> set ylabel "Relative frequency"
% gnuplot> unset key
% gnuplot> set yrange [.3:.6]
% gnuplot> set style fill solid
% gnuplot> set boxwidth .5
% gnuplot> plot "flipsrelfreq.data" using 1:3:xtic(2) with boxes
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as ``flipsrelfreqcheat.eps''
This is actually, in a strictly technical sense, a correct graph. But, looking
at it, one might conclude that T seems to occur more than twice as often as
H, so the coin is probably not fair ... until a careful examination of the
$y$-axis shows that even though the bar for T is more than twice as high as the
bar for H, that is an artifact of how much of the $y$-axis is being shown.
\end{example}
\vskip5mm
In summary, bar charts actually don't have all that much use in sophisticated
statistics, but are extremely common in the popular press (and on web sites
and so on).
\subsection{Pie Charts}\label{ssec:PC}
Another way to make a picture with categorical data is to use the fractions
from a relative frequency bar chart, but not for the heights of bars, instead
for the sizes of wedges of a pie.
\begin{example}\label{eg:flipspiechart}\index{pie chart}
Here's a pie chart with the relative frequency data from
Example~\ref{eg:flipsbarchartrelfreq}.
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{flipspiechart.eps}
\end{center}
% this chart was made in gnuplot as follows
% gnuplot> set size square
% gnuplot> set style fill solid 1.0 border -1
% gnuplot> set object 1 circle at screen 0.5,0.5 size screen 0.45 arc [0 :144 ] fillcolor rgb "red" front
% gnuplot> set object 2 circle at screen 0.5,0.5 size screen 0.45 arc [144 :360] fillcolor rgb "blue" front
% gnuplot> unset border
% gnuplot> unset tics
% gnuplot> unset key
% gnuplot> plot x with lines lc rgb "#ffffff"
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as ``flipspiechart.eps''
\end{example}
Pie charts are widely used, but actually they are almost never a good choice.
In fact, do an Internet search for the phrase ``pie charts are bad'' and there
will be nearly 3000 hits. Many of the arguments are quite insightful.
When you see a pie chart, it is either an attempt (misguided, though) by
someone to be folksy and friendly, or it is a sign that the author is quite
unsophisticated with data visualization, or, worst of all, it might be a sign
that the author is trying to use mathematical methods in a deceptive way.
In addition, all of the cautions we mentioned above for bar charts of
categorical data apply, mostly in exactly the same way, for pie charts.
\vfill
\pagebreak
\section[Visual Representation of Data, II]{Visual Representation of Data, II: Quantitative Variables}
\label{sec:VRoDIIQV}
Now suppose we have a population and a {\it quantitative}
variable\index{quantitative variable}\index{variable!quantitative}\index{quantitative variable} in which we
are interested. We get a sample, which could be large or small, and look at
the values of our variable for the individuals in that sample. There are
two ways we tend to make pictures of datasets like this: {\it stem-and-leaf
plots} and {\it histograms}.
\subsection{Stem-and-leaf Plots}\label{ssec:SalP}
One somewhat old-fashioned way to handle a modest amount of quantitative data
produces something between simply a list of all the data values and a graph.
It's not a bad technique to know about in case one has to write down a dataset
by hand, but very tedious -- and quite unnecessary, if one uses modern
electronic tools instead -- if the dataset has more than a couple dozen values.
The easiest case of this technique is where the data are all whole numbers in
the range $0$--$99$. In that case, one can take off the tens place of each
number -- call it the {\bf stem}\index{stem, in stemplot} -- and put it on the
left side of a vertical bar, and then line up all the ones places -- each is a
{\bf leaf}\index{leaf, in stemplot} -- to the right of that stem. The whole
thing is called a {\bf stem-and-leaf plot}\index{stem-and-leaf plot, stemplot}
or, sometimes, just a {\bf stemplot}\index{stem-and-leaf plot, stemplot}.
It's important not to skip any stems which are in the middle of the dataset,
even if there are no corresponding leaves. It is also a good idea to allow
repeated leaves, if there are repeated numbers in the dataset, so that the
length of the row of leaves will give a good representation of how much
data is in that general group of data values.
\begin{example}\label{eg:stemandleafplot}\index{stem-and-leaf plot, stemplot}
Here is a list of the scores of 30 students on a statistics test:
$$
\begin{matrix}
86 & 80 & 25 & 77 & 73 & 76 & 88 & 90 & 69 & 93\\
90 & 83 & 70 & 73 & 73 & 70 & 90 & 83 & 71 & 95\\
40 & 58 & 68 & 69 & 100 & 78 & 87 & 25 & 92 & 74
\end{matrix}
$$
As we said, using the tens place (and the hundreds place as well, for the
data value $100$) as the stem and the ones place as the leaf, we get
\begin{table*}[htbp]
\centering
\caption{Stem-and-leaf plot of students' scores, Key: $1 | 7 = 17$}\label{tab:stemplot1}
\begin{tabular}{r|l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}}
Stem & \multicolumn{10}{l}{Leaf}\\
10&0& & & & & & & & & \\
9&0&0&0&2&3&5& & & & \\
8&0&3&3&6&7&8& & & & \\
7&0&0&1&3&3&3&4&6&7&8\\
6&8&9&9& & & & & & & \\
5&8& & & & & & & & & \\
4&0& & & & & & & & & \\
3& & & & & & & & & & \\
2&5&5& & & & & & & & \\
\end{tabular}
\end{table*}
\end{example}
One nice feature stem-and-leaf plots have is that {\it they contain
all of the data values}, they do not lose anything (unlike our next
visualization method, for example).
\subsection{[Frequency] Histograms}\label{ssec:FHistograms}
The most important visual representation of quantitative data is a
{\bf histogram}. Histograms actually look a lot like a stem-and-leaf plot,
except turned on its side and with the row of numbers turned into a vertical
bar, like a bar graph. The height of each of these bars would be how many
leaves there were in the corresponding row of the stem-and-leaf plot.
Another way of saying that is that we would be making bars whose heights were
determined by how many scores were in each group of ten. Note there is still
a question of into which bar a value right on the edge would count: {\it e.g.,}
does the data value $50$ count in the bar to the left of that number, or the
bar to the right? It doesn't actually matter which side, but it is important
to state which choice is being made.
\begin{example}\label{eg:scoreshistbytens}\index{histogram}
Continuing with the score data in Example~\ref{eg:stemandleafplot} and
putting all data values $x$ satisfying $20\le x<30$ in the first bar,
values $x$ satisfying $30\le x<40$ in the second,
values $x$ satisfying $40\le x<50$ in the third, {\it etc.} -- that is,
put data values on the edges in the bar to the right -- we get the
figure
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{scoreshistbytens.eps}
\end{center}
% this chart was made with datafile ``scores_hist_tens.data'' containing the
% lines
% 25 2
% 45 1
% 55 1
% 65 3
% 75 10
% 85 6
% 95 6
% 105 1
%%then gnuplot was run with commands:
% gnuplot> set xlabel "Score"
% gnuplot> set ylabel "Frequency"
% gnuplot> set title "Scores Histogram with Binwidth 10"
% gnuplot> unset key
% gnuplot> set yrange [0:12]
% gnuplot> set boxwidth 10
% gnuplot> plot "scores_hist_tens.data" using 1:2 with boxes
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as ``scoreshistbytens.eps''
\end{example}
Actually, there is no reason that the bars always have to be ten units wide:
it is important that they are all the same size and that they all handle the
edge cases the same way (whether the left or right bar gets a data value on
an edge), but they could be any size. We call the successive ranges of the
$x$ coordinates which get put together for each bar
{\bf bins}\index{bins, in a histogram} or
{\bf classes}\index{classes, in a histogram}, and it is up to the statistician
to choose whichever bins -- where they start and how wide they are -- show the
data best.
Typically, the smaller the bin size, the more variation (precision) can be
seen in the bars ... but sometimes there is so much variation that the result
seems to have a lot of random jumps up and down, like static on the radio.
On the other hand, using a large bin size makes the picture smoother ... but
sometimes, it is so smooth that very little information is left. Some of this
is shown in the following
\begin{example}\label{eg:scoreshistvariousbins}\index{histogram}
Continuing with the score data in Example~\ref{eg:stemandleafplot} and now
using the bins
with $x$ satisfying $10\le x<12$, then $12\le x<14$, {\it etc.},
we get the histogram with bins of width 2:
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{scoreshistbytwos.eps}
\end{center}
If we use the bins with $x$ satisfying $10\le x<15$, then $15\le x<20$,
{\it etc.}, we get the histogram with bins of width 5:
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{scoreshistbyfives.eps}
\end{center}
If we use the bins with $x$ satisfying $20\le x<40$, then $40\le x<60$,
{\it etc.}, we get the histogram with bins of width 20:
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{scoreshistbytwenties.eps}
\end{center}
Finally, if we use the bins with $x$ satisfying $0\le x<50$, then
$50\le x<100$, and then $100\le x<150$, we get the histogram with bins of
width 50:
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{scoreshistbyfifties.eps}
\end{center}
\end{example}
\subsection{[Relative Frequency] Histograms}\label{ssec:RFHistograms}
Just as we could have bar charts with absolute (\S\ref{ssec:BCIFC}) or
relative (\S\ref{ssec:BCIIRFC}) frequencies, we can do the same for
histograms.\index{histogram!relative frequency} Above, in
\S\ref{ssec:FHistograms}, we made absolute frequency histograms. If, instead,
we divide each of the counts used to determine the heights of the bars by the
total sample size, we will get fractions or percents -- {\it relative}
frequencies. We should then change the label on the $y$-axis and the
tick-mark numbers on the $y$-axis, but otherwise the graph will look exactly
the same (as it did with relative frequency bar charts compared with absolute
frequency bar charts).
\begin{example}\label{eg:scoresRFhistbytens}\index{histogram!relative frequency}
Let's make the relative frequency histogram corresponding to the absolute
frequency histogram in Example~\ref{eg:scoreshistbytens}, based on the data
from Example~\ref{eg:stemandleafplot} -- all we have to do is change the
numbers used to make heights of the bars in the graph by dividing them by
the sample size, 30, and then also change the $y$-axis label and tick mark
numbers.
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{scoresRFhistbytens.eps}
\end{center}
% this chart was made with datafile ``scores_hist_tens.data'' containing the
% lines
% 25 2
% 45 1
% 55 1
% 65 3
% 75 10
% 85 6
% 95 6
% 105 1
%%then gnuplot was run with commands:
% gnuplot> set xlabel "Score"
% gnuplot> set ylabel "Relative Frequency"
% gnuplot> set title "Scores Relative Frequency Histogram with Binwidth 10"
% gnuplot> unset key
% gnuplot> set yrange [0:.4]
% gnuplot> set boxwidth 10
% gnuplot> plot "scores_hist_tens.data" using 1:($2/30) with boxes
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as ``scoresRFhistbytens.eps''
\end{example}
\subsection{How to Talk About Histograms}\label{ssec:HtTAH}
Histograms of course tell us what the data values are -- the location along
the $x$-axis of a bar is the value of the variable -- and how many of them
have each particular value -- the height of the bar tells how many data values
are in that bin. This is also given a technical name:
\begin{definition}\label{def:distribution}\index{distribution}
Given a variable defined on a population, or at least on a sample, the
{\bf distribution} of that variable is a list of all the values the variable
actually takes on and how many times it takes on these values.
\end{definition}
The reason we like the visual version of a distribution, its histogram, is
that our visual intuition can then help us answer general, qualitative
questions about what those data must be telling us. The first questions we
usually want to answer quickly about the data are
\begin{itemize}
\item What is the {\it shape}\index{shape!histogram} of the histogram?
\item Where is its {\it center}\index{center of a histogram, dataset, or distribution}?
\item How much {\it variability}\index{variability|see{spread of a histogram, dataset, or distribution}} [also called
{\it spread}\index{spread of a histogram, dataset, or distribution}] does it show?
\end{itemize}
When we talk about the general shape of a histogram, we often use the terms
\begin{definition}\label{def:symmskew}\index{symmetric histogram, dataset, or distribution}\index{skewed histogram, dataset, or distribution}\index{unimodal histogram, dataset, or distribution}\index{multimodal histogram, dataset, or distribution}
A histogram is {\bf symmetric} if the left half is (approximately) the mirror
image of the right half.
We say a histogram is {\bf skewed left} if the tail on the left side is longer
than on the right. In other words, left skew is when the left half of the
histogram -- half in the sense that the total of the bars in this left part
is half of the size of the dataset -- extends farther to the left than the
right does to the right. Conversely, the histogram is {\bf skewed right} if
the right half extends farther to the right than the left does to the left.
If the shape of the histogram has one significant peak, then we say it is
{\bf unimodal}, while if it has several such, we say it is {\bf multimodal}.
\end{definition}
It is often easy to point to where the center of a distribution {\it looks
like} it lies, but it is hard to be precise. It is particularly difficult
if the histogram is ``noisy,'' maybe multimodal. Similarly, looking at a
histogram, it is often easy to say it is ``quite spread out'' or ``very
concentrated in the center,'' but it is then hard to go beyond this general
sense.
Precision in our discussion of the center and spread of a dataset will
only be possible in the next section, when we work with numerical measures
of these features.
\vfill
\pagebreak
\section[Numerical Descriptions of Data, I]{Numerical Descriptions of Data, I:
Measures of the Center}
\label{sec:NDoDIMotC}
Oddly enough, there are several measures of central tendency, as ways to
define the middle of a dataset are called. There is different work to be
done to calculate each of them, and they have different uses, strengths, and
weaknesses.
For this whole section we will assume we have collected $n$ numerical values,
the values of our quantitative
variable\index{quantitative variable}\index{variable!quantitative}\index{quantitative variable} for the
sample we were able to study. When we write formul{\ae} with these values,
we can't give them variable names that look like $a, b, c, \dots$, because we
don't know where to stop (and what would we do if $n$ were more than 26?).
Instead, we'll use the variables $x_1, x_2, \dots, x_n$ to represent the data
values.
One more very convenient bit of notation, once we have started writing an
unknown number ($n$) of numbers $x_1, x_2, \dots, x_n$, is a way of writing
their sum:
\begin{definition}\label{def:summation}\index{summation notation, $\Sigma$}
\index{11300@$\Sigma$, summation notation}\index{pig, yellow}
If we have $n$ numbers which we write $x_1, \dots, x_n$, then we use the
shorthand {\bf summation notation} $\sum x_i$ to represent the sum
$\sum x_i = x_1 + \dots + x_n$. \footnote{Sometimes you will see this written
instead $\sum_{i=1}^n x_i$ . Think of the ``$\sum_{i=1}^n{}$''
as a little computer program which starts with $i=1$, increases it one step at a time
until it gets all the way to $i=n$, and adds up whatever is to the right. So,
for example, $\sum_{i=1}^3 2i$ would be $(2*1)+(2*2)+(2*3)$, and so has the value $12$.}
\end{definition}
\begin{example}\label{eg:subscriptssums}
If our dataset were $\{1, 2, 17, -3.1415, 3/4\}$, then $n$ would be 5 and the
variables $x_1, \dots, x_5$ would be defined with values $x_1=1$, $x_2=2$,
$x_3=17$, $x_4=-3.1415$, and $x_5=3/4$.
In addition\footnote{no pun intended}, we would have $\sum x_i = x_1+x_2+x_3+x_4+x_5=1+2+17-3.1415+ 3/4=17.6085$.
\end{example}
\subsection{Mode}\label{ssec:Mode}
Let's first discuss probably the simplest measure of central tendency, and in
fact one which was foreshadowed by terms like ``unimodal.''
\begin{definition}\index{mode}\label{def:mode}
A {\bf mode} of a dataset $x_1, \dots, x_n$ of $n$ numbers is one of the
values $x_i$ which occurs at least as often in the dataset as any other value.
\end{definition}
It would be nice to say this in a simpler way, something like ``the mode is
the value which occurs the most often in the dataset,'' but there may not be
a single such number.
\begin{example}\label{eg:mode}
Continuing with the data from Example~\ref{eg:stemandleafplot}, it is easy to
see, looking at the stem-and-leaf plot, that both 73 and 90 are modes.
Note that in some of the histograms we made using these data and different bin
widths, the bins containing 73 and 90 were of the same height, while in others
they were of different heights. This is an example of how it can be quite hard
to see on a histogram where the mode is... or where the mode{\bf s are}.
\end{example}
\subsection{Mean}\label{ssec:Mean}
The next measure of central tendency, and certainly the one heard most often
in the press, is simply the average. However, in statistics, this is given a
different name.
\begin{definition}\label{def:mean}\index{mean}\index{average|see{mean}}
The {\bf mean} of a dataset $x_1, \dots, x_n$ of $n$ numbers is given by the
formula $\left(\sum x_i\right)/n$.
If the data come from a sample, we use the notation
$\overline{x}$\index{11600@$\overline{x}$, sample mean} for the
{\bf sample mean}\index{sample mean, $\overline{x}$}\index{mean!sample}.
If $\{x_1, \dots, x_n\}$ is all of the data from an entire population, we use
the notation $\muX$ [this is the Greek letter ``mu,'' pronounced ``mew,'' to
rhyme with ``new.''] for the
{\bf population mean}\index{population mean, $\muX$}\index{mean!population}\index{10750@$\muX$, population mean}.
\end{definition}
\begin{example}\label{eg:mean1}
Since we've already computed the sum of the data in
Example~\ref{eg:subscriptssums} to be $17.6085$ and there were $5$ values in
the dataset, the mean is $\overline{x}=17.6085/5 = 3.5217$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}.
\end{example}
\begin{example}\label{eg:mean2}
Again using the data from Example~\ref{eg:stemandleafplot}, we can calculate
the mean $\overline{x}=\left(\sum x_i\right)/n =2246/30=74.8667$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}.
\end{example}
Notice that the mean in the two examples above was not one of the data values.
This is true quite often. What that means is that the phrase ``the average
{\it whatever},'' as in ``the average American family has $X$'' or ``the
average student does $Y$,'' is not talking about any particular family, and
we should not expect any particular family or student to have or do that
thing. Someone with a statistical education should mentally edit every
phrase like that they hear to be instead something like ``the mean of the
variable $X$ on the population of all American families is ...,'' or ``the
mean of the variable $Y$ on the population of all students is ...,'' or
whatever.
\subsection{Median}\label{ssec:Median}
Our third measure of central tendency is not the result of arithmetic, but
instead of putting the data values in increasing order.
\begin{definition}\label{def:median}\index{median}
Imagine that we have put the values of a dataset $\{x_1, \dots, x_n\}$ of $n$
numbers in increasing (or at least non-decreasing) order, so that
$x_1\le x_2\le \dots \le x_n$. Then if $n$ is odd, the {\bf median} of
the dataset is the middle value, $x_{(n+1)/2}$, while if $n$ is even,
the median is the mean of the two middle numbers,
$\frac{x_{n/2}+x_{(n/2)+1}}{2}$.
\end{definition}
\begin{example}\label{eg:median1}
Working with the data in Example~\ref{eg:subscriptssums}, we must first put
them in order, as $\{-3.1415, 3/4, 1, 2, 17\}$, so the median of this
dataset is the middle value, $1$.
\end{example}
\begin{example}\label{eg:median2}
Now let us find the median of the data from Example~\ref{eg:stemandleafplot}.
Fortunately, in that example, we made a stem-and-leaf plot and even put the
leaves in order, so that starting at the bottom and going along the rows of
leaves and then up to the next row, will give us all the values in order!
Since there are 30 values, we count up to the $15^{th}$ and $16^{th}$ values,
being 76 and 77, and from this we find that the median of the dataset is
$\frac{76+77}{2}=76.5$.
\end{example}
\subsection{Strengths and Weaknesses of These Measures of Central Tendency}
\label{ssec:SaWoTMoCT}
The weakest of the three measures above is the mode\index{mode}. Yes, it is
nice to know which value happened most often in a dataset (or which values all
happened equally often and more often than all other values). But this often
does not tell us much about the over-all structure of the data.
\begin{example}\label{eg:modeweak}\index{mode}
Suppose we had the data
$$
\begin{matrix}
86 & 80 & 25 & 77 & 73 & 76 & 88 & 90 & 67 & 93\\
94 & 83 & 72 & 75 & 79 & 70 & 91 & 82 & 71 & 95\\
40 & 58 & 68 & 69 & 100 & 78 & 87 & 25 & 92 & 74
\end{matrix}
$$
with corresponding stem-and-leaf plot
\begin{table*}[htbp]
\centering
\begin{tabular}{r|l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}l@{\hspace{4 pt}}}
Stem & \multicolumn{10}{l}{Leaf}\\
10&0& & & & & & & & & \\
9&0&1&2&3&4&5& & & & \\
8&0&2&3&6&7&8& & & & \\
7&0&1&2&3&4&5&6&7&8&9\\
6&7&8&9& & & & & & & \\
5&8& & & & & & & & & \\
4&0& & & & & & & & & \\
3& & & & & & & & & & \\
2&5&5& & & & & & & & \\
\end{tabular}
\end{table*}
\noindent
This would have a histogram with bins of width 10 that looks exactly like the
one in Example~\ref{eg:scoreshistbytens} -- so the center of the histogram
would seem, visually, still to be around the bar over the 80s -- but now
there is a unique mode of 25.
\end{example}
What this example shows is that a small change in some of the data values,
small enough not to change the histogram at all, can change the mode(s)
drastically. It also shows that the location of the mode says very little
about the data in general or its shape: the mode is based entirely on a
possibly accidental coincidence of some values in the dataset, no matter if
those values are in the ``center'' of the histogram or not.
The mean\index{mean} has a similar problem: a small change in the data, in the
sense of adding only one new data value, but one which is very far away from
the others, can change the mean quite a bit. Here is an example.
\begin{example}\label{eg:mean3}
Suppose we take the data from Example~\ref{eg:stemandleafplot} but change only
one value -- such as by changing the 100 to a 1000, perhaps by a simple typo
of the data entry. Then if we calculate the mean, we get
$\overline{x}=\left(\sum x_i\right)/n =3146/30=104.8667$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}, which is quite
different from the mean of the original dataset.
\end{example}
A data value which seems to be quite different from all (or the great majority
of) the rest is called an {\it outlier}\index{outlier}.\footnote{This is a very
informal definition of an outlier. Below we will have an extremely precise
one.} What we have just seen is that
{\bf the mean is very sensitive to outliers}\index{sensitive to outliers}.
This is a serious defect, although otherwise the mean is easy to compute, to
work with, and to prove theorems about.
Finally, the median\index{median} is somewhat tedious to compute, because
the first step is to put all the data values in order, which can be very
time-consuming. But, once that is done, throwing in an outlier tends to move
the median only a little bit. Here is an example.
\begin{example}\label{eg:median3}
If we do as in Example~\ref{eg:mean3} and change the data value of 100 in the
dataset of Example~\ref{eg:stemandleafplot} to 1000, but leave all of the other
data values unchanged, it does not change the median at all since the 1000 is
the new largest value, and that does not change the two middle values at all.
If instead we take the data of Example~\ref{eg:stemandleafplot} and simply add
another value, 1000, without taking away the 100, that does change the median:
there are now an odd number of data values, so the median is the middle one
after they are put in order, which is 77. So the median has changed by only
half a point, from 76.5 to 77. And this would even be true if the value we
were adding to the dataset were 1000000 and not just 1000!
\end{example}
In other words,
{\bf the median is very insensitive to outliers}\index{insensitive to outliers}.
Since, in practice, it is very easy for datasets to have a few random, bad
values (typos, mechanical errors, {\it etc.}), which are often outliers, it
is usually smarter to use the median than the mean.
As one final point, note that as we mentioned in \S\ref{ssec:Mean}, the
word ``average,'' the unsophisticated version of ``mean,'' is often incorrectly
used as a modifier of the individuals in some population being studied (as in
``the average American ...''), rather than as a modifier of the variable in
the study (``the average income...''), indicating a fundamental misunderstanding
of what the mean {\it means}. If you look a little harder at this
misunderstanding, though, perhaps it is based on the idea that we are looking
for the center, the ``typical'' value of the variable.
The mode might seem like a good way -- it's the most frequently occurring
value. But we have seen how that is somewhat flawed.
The mean might also seem like a good way -- it's the ``average,'' literally.
But we've also seen problems with the mean.
In fact, the median is probably closest to the intuitive idea of ``the center
of the data.'' It is, after all, a value with the property that both above
and below that value lie half of the data values.
One last example to underline this idea:
\begin{example}\label{eg:meanmedianincome}\index{mean}\index{median}
\index{income distribution}\index{Great Recession}
The period of economic difficulty for world markets in the late 2000s and early
2010s is sometimes called the {\bf Great Recession}. Suppose a politician says
that we have come out of that time of troubles, and gives as proof the fact
that the average family income has increased from the low value it had during
the Great Recession back to the values it had before then, and perhaps is even
higher than it was in 2005.
It is possible that in fact people are better off, as the increase in this
average -- mean -- seems to imply. But it is also possible that while the mean
income has gone up, the {\it median} income is still low. This would happen
if the histogram of incomes recently still has most of the tall bars down
where the variable (family income) is low, but has a few, very high outliers.
In short, if the super-rich have gotten even super-richer, that will make the
mean (average) go up, even if most of the population has experienced stagnant
or decreasing wages -- but the median will tell what is happening to most of
the population.
So when a politician uses the evidence of the average (mean) as suggested here,
it is possible they are trying to hide from the public the reality of what is
happening to the rich and the not-so-rich. It is also possible that this
politician is simply poorly educated in statistics and doesn't realize what is
going on. You be the judge ... but pay attention so you know what to ask about.
\end{example}
The last thing we need to say about the strengths and weaknesses of our
different measures of central tendency is a way to use the weaknesses of the
mean and median to our advantage. That is, since the mean is sensitive to
outliers\index{sensitive to outliers}, and pulled in the direction of those outliers, while the median is
not, we can use the difference between the two to tell us which way a histogram
is skewed.
\begin{fact}\label{fac:skewdef}\index{mean}\index{median}\index{skewed histogram, dataset, or distribution}
If the mean of a dataset is larger than the median, then histograms of that
dataset will be right-skewed\index{right-skewed histogram, dataset, or distribution}\index{skewed histogram, dataset, or distribution!right}. Similarly, if the mean is less than the median,
histograms will be left-skewed\index{left-skewed histogram, dataset, or distribution}\index{skewed histogram, dataset, or distribution!left}.
\end{fact}
\vfill
\pagebreak
\section[Numerical Descriptions of Data, II]{Numerical Descriptions of Data, II:
Measures of Spread}
\label{sec:NDoDIMoS}
\subsection{Range}\label{ssec:Range}
The simplest -- and least useful -- measure of the spread\index{spread of a histogram, dataset, or distribution} of
some data is literally how much space on the $x$-axis the histogram takes up.
To define this, first a bit of convenient notation:
\begin{definition}\label{def:xminxmax}
\index{11800@$x_{min}$, minimum value in dataset}
\index{11700@$x_{max}$, maximum value in dataset}
Suppose $x_1, \dots, x_n$ is some quantitative dataset. We shall write
$x_{min}$ for the smallest and $x_{max}$ for the largest values in the dataset.
\end{definition}
With this, we can define our first measure of spread\index{spread of a histogram, dataset, or distribution}
\begin{definition}\label{def:range}\index{range}
Suppose $x_1, \dots, x_n$ is some quantitative dataset. The {\bf range} of
this data is the number $x_{max}-x_{min}$.
\end{definition}
\begin{example}\label{eg:spread1}
Using again the statistics test scores data from
Example~\ref{eg:stemandleafplot}, we can read off from the stem-and-leaf plot
that $x_{min}=25$ and $x_{max}=100$, so the range is $75(=100-25)$.
\end{example}
\begin{example}\label{eg:spread2}
Working now with the made-up data in Example~\ref{eg:subscriptssums}, which
was put into increasing order in Example~\ref{eg:median1}, we can see that
$x_{min}=-3.1415$ and $x_{max}=17$, so the range is $20.1415(=17-(-3.1415))$.
\end{example}
The thing to notice here is that since the idea of outliers is that they are
outside of the normal behavior of the dataset, if there are any outliers they
will definitely be the values which get called $x_{min}$ or $x_{max}$ (or both). So
{\bf the range is supremely sensitive to outliers}\index{sensitive to outliers}: if there are any outliers,
the range will be determined exactly by them, and not by what the typical data
is doing.
\subsection{Quartiles and the $IQR$}\label{ssec:QuartilesIQR}
Let's try to find a substitute for the range which is not so sensitive to
outliers. We want to see how far apart not the maximum and minimum of the
whole dataset are, but instead how far apart are the typical larger
values in the dataset and the typical smaller values. How can we measure
these typical larger and smaller? One way is to define these in terms of the
typical -- central -- value of the upper half of the data and the typical
value of the lower half of the data. Here is the definition we shall use for
that concept:
\begin{definition}\label{def:quartile}\index{quartile}\index{upper half data}
\index{lower half data}
Imagine that we have put the values of a dataset $\{x_1, \dots, x_n\}$ of $n$
numbers in increasing (or at least non-decreasing) order, so that $x_1\le
x_2\le \dots \le x_n$. If $n$ is odd, call the
{\bf lower half data}\index{lower half data} all the values
$\{x_1, \dots, x_{(n-1)/2}\}$ and the {\bf upper half data}\index{upper half data}
all the values $\{x_{(n+3)/2}, \dots, x_n\}$; if $n$ is even, the
{\bf lower half data} will be the values $\{x_1, \dots, x_{n/2}\}$ and the
{\bf upper half data} all the values $\{x_{(n/2)+1}, \dots, x_n\}$.
Then the {\bf first quartile}\index{first quartile}\index{quartile}, written
$Q_1$\index{10800@$Q_1$, first quartile}, is the median of the lower half data,
and the {\bf third quartile}\index{third quartile}\index{quartile}, written
$Q_3$\index{10900@$Q_3$, third quartile}, is the median of the upper half data.
\end{definition}
Note that the first quartile is halfway through the lower half of the data.
In other words, it is a value such that one quarter of the data is smaller.
Similarly, the third quartile is halfway through the upper half of the data,
so it is a value such that three quarters of the data is smaller. Hence the
names ``first'' and ``third quartiles.''
We can build an outlier-insensitive\index{insensitive to outliers} measure of spread\index{spread of a histogram, dataset, or distribution} out of the
quartiles.
\begin{definition}\label{def:IQR}\index{10700@$IQR$, inter-quartile range}
\index{inter-quartile range, $IQR$}
Given a quantitative dataset, its {\bf inter-quartile range} or {\bf $IQR$} is
defined by $IQR=Q_3-Q_1$.
\end{definition}
\begin{example}\label{eg:iqr1}
Yet again working with the statistics test scores data from
Example~\ref{eg:stemandleafplot}, we can count off the lower and upper half
datasets from the stem-and-leaf plot, being respectively
$$
\rm{Lower}=\{25, 25, 40, 58, 68, 69, 69, 70, 70, 71, 73, 73, 73, 74, 76\}
$$
and
$$
\ \ \ \ \ \rm{Upper} = \{77, 78, 80, 83, 83, 86, 87, 88, 90, 90, 90, 92, 93, 95, 100\}\ .
$$
It follows that, for these data, $Q_1=70$ and $Q_3=88$, so $IQR=18(=88-70)$.
\end{example}
\begin{example}\label{eg:iqr2}
Working again with the made-up data in Example~\ref{eg:subscriptssums}, which
was put into increasing order in Example~\ref{eg:median1}, we can see that the
lower half data is $\{-3.1415, .75\}$, the upper half is
$\{2, 17\}$, $Q_1=-1.19575(=\frac{-3.1415+.75}{2})$, $Q_3=9.5(=\frac{2+17}{2})$,
and $IQR=10.69575(=9.5-(-1.19575))$.
\end{example}
\subsection{Variance and Standard Deviation}\label{ssec:VarStdDev}
We've seen a crude measure of spread\index{spread of a histogram, dataset, or distribution}, like the crude measure
``mode''\index{mode} of central tendency. We've also seen a better measure
of spread, the $IQR$\index{10700@$IQR$, inter-quartile range}, which is
insensitive\index{insensitive to outliers} to outliers like the median\index{median} (and built out of
medians). It seems that, to fill out the parallel triple of measures, there
should be a measure of spread which is similar to the mean. Let's try to
build one.
Suppose the data is sample data. Then how far a particular data value $x_i$
is from the sample mean $\overline{x}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}
is just $x_i-\overline{x}$. So the mean displacement from the mean, the mean
of $x_i-\overline{x}$, should be a good measure of variability, shouldn't it?
Unfortunately, it turns out that the mean of
$x_i-\overline{x}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} is always 0. This
is because when $x_i>\overline{x}$, $x_i-\overline{x}$ is positive, while
when $x_i<\overline{x}$, $x_i-\overline{x}$ is negative, and it turns out
that the positives always exactly cancel the negatives (see if you can prove
this algebraically, it's not hard).
We therefore need to make the numbers
$x_i-\overline{x}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} positive before
taking their mean. One way to do this is to square them all. Then we take
something which is almost the mean of these squared numbers to get another
measure of spread\index{spread of a histogram, dataset, or distribution} or variability:
\begin{definition}\label{def:varstddevsamp}\index{variance}
\index{sample variance, $S_x^2$}\index{11200@$S_x^2$, sample variance}\index{11100@$S_x$, sample standard deviation}\index{standard deviation}\index{sample standard deviation, $S_x$}Given sample data $x_1, \dots, x_n$ from a sample of size
$n$, the {\bf sample variance} is defined as
$$
S_x^2 = \frac{\sum \left(x_i-\overline{x}\right)^2}{n-1} .
$$\index{11600@$\overline{x}$, sample mean}
Out of this, we then define the {\bf sample standard deviation}
$$
S_x = \sqrt{S_x^2} = \sqrt{\frac{\sum \left(x_i-\overline{x}\right)^2}{n-1}} .
$$\index{11600@$\overline{x}$, sample mean}
\end{definition}
Why do we take the square root in that sample standard deviation? The answer
is that the measure we build should have the property that if all the numbers
are made twice as big, then the measure of spread\index{spread of a histogram, dataset, or distribution} should also be
twice as big. Or, for example, if we first started working with data measured
in feet and then at some point decided to work in inches, the numbers would
all be 12 times as big, and it would make sense if the measure of spread were
also 12 times as big.
The variance does not have this property: if the data are all doubled, the
variance increases by a factor of 4. Or if the data are all multiplied by 12,
the variance is multiplied by a factor of 144.
If we take the square root of the variance, though, we get back to the nice
property that doubling the data doubles the measure of spread\index{spread of a histogram, dataset, or distribution},
{\it etc.} For this reason, while we have defined the variance on its own
and some calculators, computers, and on-line tools will tell the variance
whenever you ask them to compute 1-variable statistics, we will in this
class only consider the variance a stepping stone on the way to the real
measure of spread of data, the standard deviation.
One last thing we should define in this section. For technical reasons that
we shall not go into now, the definition of standard deviation is slightly
different if we are working with population data and not sample data:
\begin{definition}\label{def:varstddevpop}\index{variance}\index{population variance, $\sigmaX^2$}\index{11500@$\sigmaX^2$, population variance}\index{11400@$\sigmaX$, population standard deviation}\index{standard deviation}\index{population standard deviation, $\sigmaX$}
Given data $x_1, \dots, x_n$ from an entire population of size $n$, the
{\bf population variance} is defined as
$$
\sigmaX^2 = \frac{\sum \left(x_i-\muX\right)^2}{n} .
$$
Out of this, we then define the {\bf population standard deviation}
$$
\sigmaX = \sqrt{\sigmaX^2} =
\sqrt{\frac{\sum \left(x_i-\muX\right)^2}{n}} .
$$
\end{definition}
[This letter $\sigma$ is the lower-case Greek letter sigma, whose upper case
$\Sigma$ you've seen elsewhere.]
Now for some examples. Notice that to calculate these values, we shall
always use an electronic tool like a calculator\index{calculator} or a
spreadsheet\index{spreadsheet|see{{\bf LibreOffice Calc} and {\bf MS Excel}}}
that has a built-in variance\index{variance} and standard
deviation\index{standard deviation} program -- experience shows that it is
nearly impossible to get all the calculations entered correctly into a
non-statistical calculator, so we shall not even try.
\begin{example}\label{eg:varstd1}
For the statistics test scores data from Example~\ref{eg:stemandleafplot},
entering them into a spreadsheet and using {\tt VAR.S}\index{VAR.S@{\tt VAR.S}, sample variance in spreadsheets} and
{\tt STDEV.S}\index{STDEV.S@{\tt STDEV.S}, sample standard deviation in spreadsheets} for
the sample variance and standard deviation\index{11100@$S_x$, sample standard deviation}\index{standard deviation}\index{sample standard deviation, $S_x$}\index{sample variance, $S_x^2$}\index{11200@$S_x^2$, sample variance} and
{\tt VAR.P}\index{VAR.P@{\tt VAR.P}, population variance in spreadsheets} and {\tt STDEV.P}\index{STDEV.P@{\tt STDEV.P}, population standard deviation in spreadsheets}
for population variance and population standard deviation\index{variance}\index{population variance, $\sigmaX^2$}\index{11500@$\sigmaX^2$, population variance}\index{11400@$\sigmaX$, population standard deviation}\index{standard deviation}\index{population standard deviation, $\sigmaX$}, we get
\begin{align*}
S_x^2 &= 331.98\\
S_x &= 18.22\\
\sigmaX^2 &= 320.92\\
\sigmaX &= 17.91
\end{align*}
\end{example}
\begin{example}\label{eg:varstd2}
Similarly, for the data in Example~\ref{eg:subscriptssums}, we find in the same
way that
\begin{align*}
S_x^2 &= 60.60\\
S_x &= 7.78\\
\sigmaX^2 &= 48.48\\
\sigmaX &= 6.96
\end{align*}
\end{example}
\subsection{Strengths and Weaknesses of These Measures of Spread}
\label{ssec:SaWoTMoS}
We have already said that
{\bf the range\index{range} is extremely sensitive to outliers}\index{sensitive to outliers}.\index{outlier}
The $IQR$,\index{10700@$IQR$, inter-quartile range}\index{inter-quartile range, $IQR$}
however, is built up out of medians\index{median}, used in different ways, so
{\bf the $IQR$ is insensitive to outliers}\index{insensitive to outliers}.\index{outlier}
The variance\index{variance}, both sample and population, is built using a
process quite like a mean\index{mean}, and in fact also has the mean itself
in the defining formula. Since the standard deviation\index{standard deviation}
in both cases is simply the square root of the variance, it follows that
{\bf the sample and population variances and standard deviations
are all sensitive to outliers}.\index{sensitive to outliers}\index{outlier}
This differing sensitivity and insensitivity to outliers\index{outlier} is the
main difference between the different measures of spread\index{spread of a histogram, dataset, or distribution} that we
have discussed in this section.
One other weakness, in a certain sense, of the
$IQR$\index{10700@$IQR$, inter-quartile range} is that there are several
different definitions in use of the quartiles, based upon whether the median
value is included or not when dividing up the data. These are called, for
example, {\tt QUARTILE.INC}\index{QUARTILE.INC@{\tt QUARTILE.INC}, quartile computation in spreadsheets} and {\tt QUARTILE.EXC}\index{QUARTILE.EXC@{\tt QUARTILE.EXC}, quartile computation in spreadsheets} on some spreadsheets. It
can then be confusing which one to use.
\subsection{A Formal Definition of Outliers -- the $1.5\,IQR$ Rule}
\label{ssec:AFDoO}
So far, we have said that outliers\index{outlier} are simply data that are
{\it atypical}. We need a precise definition that can be carefully checked.
What we will use is a formula (well, actually two formul{\ae}) that describe
that idea of an outlier being {\it far away from the rest of data}.
Actually, since outliers should be far away either in being significantly
bigger than the rest of the data or in being significantly smaller, we should
take a value on the upper side of the rest of the data, and another on the
lower side, as the starting points for this {\it far away}. We can't pick
the $x_{max}$\index{11700@$x_{max}$, maximum value in dataset} and
$x_{min}$\index{11800@$x_{min}$, minimum value in dataset} as those starting
points, since they will be the outliers themselves, as we have noticed. So
we will use our earlier idea of a value which is typical for the larger part
of the data, the quartile\index{quartile} $Q_3$\index{10900@$Q_3$, third quartile},
and $Q_1$\index{10800@$Q_1$, first quartile} for the corresponding
lower part of the data.
Now we need to decide how far is {\it far enough away} from those quartiles
to count as an outlier. If the data already has a lot of variation, then a
new data value would have to be quite far in order for us to be sure that it
is not out there just because of the variation already in the data. So our
measure of {\it far enough} should be in terms of a measure of
spread\index{spread of a histogram, dataset, or distribution} of the data.
Looking at the last section, we see that only the
$IQR$\index{10700@$IQR$, inter-quartile range} is a measure of
spread\index{spread of a histogram, dataset, or distribution} which is insensitive to outliers\index{insensitive to outliers} -- and we definitely
don't want to use a measure which is sensitive to the outliers\index{sensitive to outliers}, one which
would have been affected by the very outliers we are trying to define.
All this goes together in the following
\begin{definition}\label{def:outlier}\index{outlier}
[The
{\bf $1.5\,IQR$ Rule for Outliers}\index{10600@$1.5\,IQR$ Rule for Outliers}]
Starting with a quantitative dataset whose first and third
quartiles\index{quartile} are $Q_1$\index{10800@$Q_1$, first quartile} and
$Q_3$ and whose inter-quartile range is $IQR$, a data value $x$ is
[officially, from now on] called an {\bf outlier} if $x<Q_1-1.5\,IQR$ or $x>Q_3+1.5\,IQR$.
\end{definition}
\noindent
Notice this means that $x$ is not an outlier if it satisfies
$Q_1-1.5\,IQR\le x\le Q_3+1.5\,IQR$.
\begin{example}\label{eg:outliers1}
Let's see if there were any outliers in the test score dataset from
Example~\ref{eg:stemandleafplot}. We found the quartiles and $IQR$
in Example~\ref{eg:iqr1}, so from the $1.5\,IQR$ Rule, a data value $x$ will
be an outlier if
$$
x<Q_1-1.5\,IQR=70-1.5\cdot18=43
$$
or
$$
x>Q_3+1.5\,IQR=88+1.5\cdot18=115\ .
$$
Looking at the stemplot in Table~\ref{tab:stemplot1}, we conclude that the data
values $25$, $25$, and $40$ are the outliers in this dataset.
\end{example}
\begin{example}\label{eg:outliers2}
Applying the same method to the data in Example~\ref{eg:subscriptssums}, using
the quartiles and $IQR$ from Example~\ref{eg:iqr2}, the condition for an
outlier $x$ is
$$
x<Q_1-1.5\,IQR=-1.19575-1.5\cdot10.69575=-17.239375
$$
or
$$
x>Q_3+1.5\,IQR=9.5+1.5\cdot10.69575=25.543625\ .
$$
Since none of the data values satisfy either of these conditions, there are
no outliers in this dataset.
\end{example}
\subsection{The Five-Number Summary and Boxplots}
\label{ssec:TF-NSaB}
We have seen that numerical summaries of quantitative data can be very useful
for quickly understanding (some things about) the data. It is therefore
convenient to have a nice package of several of these:
\begin{definition}\label{def:fivenumbersum}\index{five-number summary}
Given a quantitative dataset $\{x_1, \dots, x_n\}$, the
{\bf five-number summary}\footnote{Which we might write 5N$\Sigma$ary for short.}
of this data is the set of values
$$
\left\{x_{min},\ \ Q_1,\ \ \mathrm{median},\ \ Q_3,\ \ x_{max}\right\}
$$\index{11800@$x_{min}$, minimum value in dataset}\index{10800@$Q_1$, first quartile}\index{median}\index{quartile}\index{10900@$Q_3$, third quartile}\index{11700@$x_{max}$, maximum value in dataset}
\end{definition}
\begin{example}\label{eg:5num1}
Why not write down the five-number summary for the same test score data we
saw in Example~\ref{eg:stemandleafplot}? We've already done most of the
work, such as calculating the min and max in Example~\ref{eg:spread1}, the
quartiles in Example~\ref{eg:iqr1}, and the median in Example~\ref{eg:median2},
so the five-number summary is
\begin{align*}
x_{min}&=25\\
Q_1&=70\\
\mathrm{median}&=76.5\\
Q_3&=88\\
x_{max}&=100
\end{align*}
\end{example}
\begin{example}\label{eg:5num2}
And, for completeness, the five-number summary for the made-up data in
Example~\ref{eg:subscriptssums} is
\begin{align*}
x_{min}&=-3.1415\\
Q_1&=-1.19575\\
\mathrm{median}&=1\\
Q_3&=9.5\\
x_{max}&=17
\end{align*}
where we got the min and max from Example~\ref{eg:spread2}, the median from
Example~\ref{eg:median1}, and the quartiles from Example~\ref{eg:iqr2}.
\end{example}
\vskip5mm
As we have seen already several times, it is nice to have both a numeric and
a graphical/visual version of everything. The graphical equivalent of the
five-number summary\index{five-number summary} is
\begin{definition}\label{def:boxplot}\index{boxplot, box-and-whisker plot}
Given some quantitative data, a {\bf boxplot} [sometimes
{\bf box-and-whisker plot}] is a graphical depiction of the five-number
summary, as follows:
\begin{itemize}
\item an axis is drawn, labelled with the variable of the study
\item tick marks and numbers are put on the axis, enough to allow the
following visual features to be located numerically
\item a rectangle (the {\it box}) is drawn parallel to the axis, stretching
from values $Q_1$\index{10800@$Q_1$, first quartile} to
$Q_3$\index{10900@$Q_3$, third quartile} on the axis
\item an additional line is drawn, parallel to the sides of the box at locations
$Q_1$ and $Q_3$, at the axis coordinate of the median\index{median} of
the data
\item lines are drawn parallel to the axis from the middle of the sides of the box
at the locations $Q_1$ and $Q_3$
out to the axis coordinates
$x_{min}$\index{11800@$x_{min}$, minimum value in dataset} and
$x_{max}$\index{11700@$x_{max}$, maximum value in dataset}, where these
{\it whiskers} terminate in ``T''s.
\end{itemize}
\end{definition}
\begin{example}\label{eg:boxplot1}
A boxplot for the test score data we started using in
Example~\ref{eg:stemandleafplot} is easy to make after we found the
corresponding five-number summary in Example~\ref{eg:5num1}:
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{boxplot1.eps}
\end{center}
% this chart was made with datafile ``boxplot.data'' containing the
% lines
% 25
% 25
% 40
% 58
% 68
% 69
% 69
% 70
% 70
% 71
% 73
% 73
% 73
% 74
% 76
% 77
% 78
% 80
% 83
% 83
% 86
% 87
% 88
% 90
% 90
% 90
% 92
% 93
% 95
% 100
%%then gnuplot was run with commands:
% gnuplot> unset key
% gnuplot> unset xtics
% gnuplot> set xlabel "test score"
% gnuplot> set title "Boxplot for Test Score Data"
% gnuplot> set yrange [0:110]
% gnuplot> set style boxplot fraction 1
% gnuplot> plot 'boxplot.data' using (1.0):1 w boxplot
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as "boxplot1.eps"
\end{example}
Sometimes it is nice to make a version of the boxplot which is less sensitive
to outliers\index{sensitive to outliers}\index{outlier}. Since the endpoints of the whiskers are the only
parts of the boxplot which are sensitive in this way, they are all we have to
change:
\begin{definition}\label{def:boxplotwithOLs}
\index{boxplot, box-and-whisker plot!showing outliers}
Given some quantitative data, a {\bf boxplot showing outliers} [sometimes
{\bf box-and-whisker plot showing outliers}] is a minor modification of the
regular boxplot, as follows
\begin{itemize}
\item the whiskers only extend as far as the largest and smallest non-outlier
data values
\item dots are put along the lines of the whiskers at the axis coordinates of
any outliers in the dataset
\end{itemize}
\end{definition}
\begin{example}\label{eg:boxplot2}
A boxplot showing outliers for the test score data we started using in
Example~\ref{eg:stemandleafplot} is only a small modification of the one we
just made in Example~\ref{eg:boxplot1}
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{boxplot2.eps}
\end{center}
% this chart was made with datafile ``boxplot.data'' containing the
% lines
% 25
% 25
% 40
% 58
% 68
% 69
% 69
% 70
% 70
% 71
% 73
% 73
% 73
% 74
% 76
% 77
% 78
% 80
% 83
% 83
% 86
% 87
% 88
% 90
% 90
% 90
% 92
% 93
% 95
% 100
%%then gnuplot was run with commands:
% gnuplot> unset key
% gnuplot> unset xtics
% gnuplot> set ylabel "test score"
% gnuplot> set title "Boxplot Showing Outliers for Test Score Data"
% gnuplot> set yrange [0:110]
% gnuplot> plot 'boxplot.data' using (1.0):1 w boxplot
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as "boxplot2.eps"
\end{example}
\ \vfill
\pagebreak
\section*{Exercises}
\begin{exercise}
A product development manager at the campus bookstore wants to make sure
that the backpacks being sold there are strong enough to carry the heavy books
students carry around campus. The manager decides she will collect some data
on how heavy are the bags/packs/suitcases students are carrying around at
the moment, by stopping the next 100 people she meets at the center of campus
and measuring.
What are the individuals in this study? What is the population? Is there a
sample -- what is it? What is the variable? What kind of variable is this?
\end{exercise}
\begin{exercise}
During a blood drive on campus, 300 people donated blood. Of these, 136 had blood
of type $O$, 120 had blood of type $A$, 32 of type $B$, and the rest of type
$AB$.
Answer the same questions as in the previous exercise for this new situation.
Now make at least two visual representations of these data.
\end{exercise}
\begin{exercise}
Go to \href{https://en.wikipedia.org/wiki/Heights_of_presidents_and_presidential_candidates_of_the_United_States}{the {\bf Wikipedia} page} for ``Heights of
Presidents and Presidential Candidates of the United States'' and look only at
the heights of the presidents themselves, in centimeters ({\it cm}).
Make a histogram with these data using bins of width 5. Explain how you are
handling the edge cases in your histogram.
\end{exercise}
\vskip1cm
\begin{exercise}
Suppose you go to the supermarket every week for a year and buy a bag of
flour, packaged by a major national flour brand, which is labelled as weighing
$1kg$. You take the bag home and weigh it on an extremely accurate scale
that measures to the nearest ${1/100}^{th}$ of a gram. After the 52 weeks of
the year of flour buying, you make a histogram of the accurate weights of the
bags. What do you think that histogram will look like? Will it be symmetric
or skewed left or right (which one?), where will its center be, will it show
a lot of variation/spread or only a little? Explain why you think each of
the things you say.
What about if you buy a $1kg$ loaf of bread from the local artisanal bakery
-- what would the histogram of the accurate weights of those loaves look like
(same questions as for histogram of weights of the bags of flour)?
If you said that those histograms were symmetric, can you think of a
measurement you would make in a grocery store or bakery which would be
skewed; and if you said the histograms for flour and loaf weights were skewed,
can you think of one which would be symmetric? (Explain why, always, of
course.) [If you think one of the two above histograms was skewed and one was
symmetric (with explanation), you don't need to come up with another one here.]
\end{exercise}
\ \vfill\
\pagebreak
\begin{exercise}
Twenty sacks of grain weigh a total of $1003kg$. What is the mean\index{mean}
weight per sack?
Can you determine the median\index{median} weight per sack from the given
information? If so, explain how. If not, give two examples of datasets with
the same total weight but different medians.
\end{exercise}
\begin{exercise}
For the dataset $\{6, -2, 6, 14, -3, 0, 1, 4, 3, 2, 5\}$, which we will call
$DS_1$, find the mode(s)\index{mode}, mean\index{mean}, and
median\index{median}.
Define $DS_2$ by adding $3$ to each number in $DS_1$. What are the
mode(s)\index{mode}, mean\index{mean}, and median\index{median} of $DS_2$?
Now define $DS_3$ by subtracting $6$ from each number in $DS_1$. What are the
mode\index{mode}(s), mean\index{mean}, and median\index{median} of $DS_3$?
Next, define $DS_4$ by multiplying every number in $DS_1$ by 2. What are the
mode\index{mode}(s), mean\index{mean}, and median\index{median} of $DS_4$?
Looking at your answers to the above calculations, how do you think the
mode\index{mode}(s), mean\index{mean}, and median\index{median} of datasets
must change when you add, subtract, multiply or divide all the numbers by the
same constant? Make a specific conjecture!
\end{exercise}
\vskip1cm
\begin{exercise}
There is a very hard mathematics competition in which college students in the
US and Canada can participate called the
{\bf William Lowell Putnam Mathematical Competition}. It consists of a
six-hour long test with twelve problems, graded 0 to 10 on each problem, so the
total score could be anything from 0 to 120.
The median\index{median} score last year on the Putnam exam was 0 (as it often
is, actually). What does this tell you about the scores of the students who
took it? Be as precise as you can. Can you tell what fraction (percentage)
of students had a certain score or scores? Can you figure out what the
quartiles\index{quartile} must be?
\end{exercise}
\begin{exercise}
Find the range\index{range}, $IQR$\index{10700@$IQR$, inter-quartile range}, and
standard deviation\index{standard deviation} of the following sample dataset:
$$
DS_1 = \{0, 0, 0, 0, 0, .5, 1, 1, 1, 1, 1\}\quad .
$$
Now find the range\index{range}, $IQR$\index{10700@$IQR$, inter-quartile range},
and standard deviation\index{standard deviation} of the following sample data:
$$
DS_2 = \{0, .5, 1, 1, 1, 1, 1, 1, 1, 1, 1\}\quad .
$$
Next find the range\index{range}, $IQR$\index{10700@$IQR$, inter-quartile range},
and standard deviation\index{standard deviation} of the following sample data:
$$
DS_3 = \{0, 0, 0, 0, 0, 0, 0, 0, 0, .5, 1\}\quad .
$$
Finally, find the range\index{range},
$IQR$\index{10700@$IQR$, inter-quartile range}, and standard
deviation\index{standard deviation} of sample data $DS_4$,
consisting of 98 0s, one .5, and one 1 (so like $DS_3$ except with 0 occurring
98 times instead of 9 times).
\end{exercise}
\begin{exercise}
What must be true about a dataset if its range\index{range} is 0? Give the
most interesting example of a dataset with range of 0 and the property you
just described that you can think of.
What must be true about a dataset if its
$IQR$\index{10700@$IQR$, inter-quartile range} is 0? Give the most interesting
example of a dataset with $IQR$ of 0 and the property you just described that
you can think of.
What must be true about a dataset if its standard
deviation\index{standard deviation} is 0? Give the most interesting example
of a dataset with standard deviation of 0 and the property you just described
that you can think of.
\end{exercise}
\begin{exercise}
Here are some boxplots\index{boxplot, box-and-whisker plot} of test scores,
out of 100, on a standardized test given in five different classes -- the
same test, different classes. For each of these plots, $A - E$, describe
qualitatively (in the sense of \S\ref{ssec:HtTAH}) but in as much detail as
you can, what must have been the histogram\index{histogram} for the data
behind this boxplot. Also sketch a possible such histogram, for each case.
\vskip6mm
\begin{center}
\includegraphics[height=7cm,clip]{boxplot_exc.eps}
\end{center}
% this chart was made with datafile ``boxplot_exc.data'' containing the
% lines
% 5 5 5 5 5
% 10 42 6 6 58
% 15 43 7 7 57
% 20 44 8 8 56
% 25 45 9 9 55
% 30 46 10 10 54
% 35 47 11 11 53
% 40 48 12 12 52
% 45 49 13 13 51
% 50 50 50 50 50
% 55 51 87 51 87
% 60 52 88 52 88
% 65 53 89 53 89
% 70 54 90 54 90
% 75 55 91 55 91
% 80 56 92 56 92
% 85 57 93 57 93
% 90 58 94 58 94
% 95 95 95 95 95
%%then gnuplot was run with commands:
% gnuplot> unset key
% gnuplot> set xtics ("A" 1, "B" 2, "C" 3, "D" 4, "E" 5) scale 0.0
% gnuplot> set xtics nomirror
% gnuplot> set ylabel "test scores"
% gnuplot> set ytics nomirror
% gnuplot> set yrange [0:100]
% gnuplot> set style boxplot fraction 1
% gnuplot> set style data boxplot
% gnuplot> plot 'boxplot_exc.data' using (1):1, '' using (2):2, '' using (3):3, '' using (4):4, '' using (5):5
%then gimp was run and a screengrab was taken of the relevant portion of the
%gnuplot graph, and exported as "boxplot_exc.eps"
\end{exercise}
\chapter{Bi-variate Statistics: Basics}
\label{chap:2VS}
\section{Terminology: Explanatory/Response or Independent/Dependent}
\label{sec:TERoID}
All of the discussion so far has been for studies which have a single
variable. We may collect the values of this variable for a large population,
or at least the largest sample we can afford to examine, and we may display
the resulting data in a variety of graphical ways, and summarize it in a
variety of numerical ways. But in the end all this work can only show a
single characteristic of the individuals. If, instead, we want to study a
{\it relationship}, we need to collect two (at least) variables and develop
methods of descriptive statistics which show the relationships between the
values of these variables.
Relationships in data require at least two variables. While more complex
relationships can involve more, in this chapter we will start the project of
understanding {\it bivariate data}\index{bivariate data}, data where we make
two observations for each individual, where we have exactly two variables.
If there is a relationship between the two variables we are studying, the
most that we could hope for would be that that relationship is due to the
fact that one of the variables {\it causes} the other. In this situation,
we have special names for these variables
\begin{definition}\label{def:explanatoryresponsevars}
In a situation with bivariate data, if one variable can take on any value
without (significant) constraint it is called the {\bf independent variable}%
\index{independent variable}\index{variable!independent}, while the second
variable, whose value is (at least partially) controlled by the first, is called
the {\bf dependent variable}\index{dependent variable}%
\index{variable!dependent}.
Since the value of the dependent variable depends upon the value of the
independent variable, we could also say that it is explained by the
independent variable. Therefore the independent variable is also called the
{\bf explanatory variable}\index{explanatory variable}%
\index{variable!explanatory} and the dependent variable is then called the
{\bf response variable}\index{response variable}\index{variable!response}.
Whenever we have bivariate data and we have made a choice of which variable
will be the independent and which the dependent, we write $x$ for the
independent and $y$ for the dependent variable.
\end{definition}
\begin{example}\label{eg:depindepvars1}
Suppose we have a large warehouse of many different boxes of products ready to
ship to clients. Perhaps we have packed all the products in boxes which are
perfect cubes, because they are stronger and it is easier to stack them
efficiently. We could do a study where
\begin{itemize}
\item the {\it individuals} would be the boxes of product;
\item the {\it population} would be all the boxes in our warehouse;
\item the {\it independent variable} would be, for a particular box, the
length of its side in {\it cm};
\item the {\it dependent variable} would be, for a particular box, the cost
to the customer of buying that item, in US dollars.
\end{itemize}
We might think that the size {\it determines} the cost, at least approximately,
because the larger boxes contain larger products into which went more raw
materials and more labor, so the items would be more expensive. So, at least
roughly, the size may be anything, it is a free or {\it independent} choice,
while the cost is (approximately) determined by the size, so the cost is
{\it dependent}. Otherwise said, the size {\it explains} and the cost is the
{\it response}. Hence the choice of those variables.
\end{example}
\begin{example}\label{eg:depindepvars3}
Suppose we have exactly the same scenario as above, but now we want to make the
different choice where
\begin{itemize}
\item the {\it dependent variable} would be, for a particular box, the volume
of that box.
\end{itemize}
\end{example}
There is one quite important difference between the two examples above: in
one case (the cost), knowing the length of the side of a box gives us a hint
about how much it costs (bigger boxes cost more, smaller boxes cost less) but
this knowledge is imperfect (sometimes a big box is cheap, sometimes a small
box is expensive); while in the other case (the volume), knowing the length of
the side of the box perfectly tells us the volume. In fact, there is a
simple geometric formula that the volume $V$ of a cube of side length $s$ is
given by $V=s^3$.
This motivates a last preliminary definition
\begin{definition}\label{def:deterministic}\index{deterministic}
We say that the relationship between two variables is {\bf deterministic} if
knowing the value of one variable completely determines the value of the
other. If, instead, knowing one value does not completely determine the other,
we say the variables have a
{\bf non-deterministic relationship}.\index{non-deterministic}
\end{definition}
\ \vfill
\pagebreak
\section{Scatterplots}\label{sec:scatterplots}
When we have bivariate data, the first thing we should always do is draw a
graph of this data, to get some feeling about what the data is showing us and
what statistical methods it makes sense to try to use. The way to do this is
as follows
\begin{definition}\label{def:scatterplot}\index{scatterplot}
Given bivariate quantitative data, we make the {\bf scatterplot} of this
data as follows: Draw an $x$- and a $y$-axis, and label them with descriptions
of the independent and dependent variables, respectively. Then, for each
individual in the dataset, put a dot on the graph at location $(x,y)$, if
$x$ is the value of that individual's independent variable and $y$ the value
of its dependent variable.
\end{definition}
After making a scatterplot, we usually describe it qualitatively in three
respects:
\begin{definition}\label{def:scattershape}
If the cloud of data points in a scatterplot generally lies near some curve,
we say that the scatterplot has [approximately] that
{\bf shape}\index{shape!scatterplot}.
A common shape we tend to find in scatterplots is that it is
{\bf linear}\index{linear association}.
If there is no visible shape, we say the scatterplot is
{\bf amorphous}\index{amorphous, for scatterplots or associations}, or
{\bf has no clear shape}.
\end{definition}
\begin{definition}\label{def:scatterstrength}
When a scatterplot has some visible shape -- so that we do not describe it as
amorphous -- how close the cloud of data points is to that curve is called the
{\bf strength}\index{strength of an association} of that association. In this
context, a {\bf strong}\index{strong association} [linear, {\it e.g.,}]
association means that the dots are close to the named curve [line,
{\it e.g.,}], while a {\bf weak}\index{weak association} association means
that the points do not lie particularly close to any of the named curves
[line, {\it e.g.,}].
\end{definition}
\begin{definition}\label{def:scatterdirection}
In case a scatterplot has a fairly strong linear association, the
{\bf direction}\index{direction of a linear association} of the association
describes whether the line is increasing or decreasing. We say the
association is {\bf positive}\index{positive linear association} if the line
is increasing and {\bf negative}\index{negative linear association} if it is
decreasing.
\end{definition}
[Note that the words {\it positive} and {\it negative} here can be thought of
as describing the {\it slope}\index{slope of a line} of the line which we are
saying is the underlying relationship in the scatterplot.]
\ \vfill
\pagebreak
\section{Correlation}\label{sec:correlation}
As before (in \S\S\ref{sec:NDoDIMotC} and \ref{sec:NDoDIMoS}), when we moved
from describing histograms with words (like {\it symmetric}) to describing them
with numbers (like the {\it mean}), we now will build a numeric measure of
the strength and direction of a linear association in a scatterplot.
\begin{definition}\label{def:corrcoeff}\index{correlation coefficient, $r$}
Given bivariate quantitative data $\{(x_1,y_1), \dots , (x_n,y_n)\}$ the
{\bf [Pearson] correlation coefficient}\index{11000@$r$, correlation coefficient}
of this dataset is
$$
r=\frac{1}{n-1}\sum \frac{(x_i-\overline{x})}{s_x}\frac{(y_i-\overline{y})}{s_y}
$$
where $s_x$ and $s_y$ are the standard deviations of the $x$ and $y$,
respectively, datasets by themselves.
\end{definition}
We collect some basic information about the correlation coefficient in the
following
\begin{fact}\label{fact:corrcoefff}
For any bivariate quantitative dataset $\{(x_1,y_1), \dots ,(x_n,y_n)\}$ with
correlation coefficient $r$, we have
\begin{enumerate}
\item $-1\le r\le 1$ is always true;
\item if $|r|$ is near $1$ -- meaning that $r$ is near $\pm 1$ -- then the
 linear association between $x$ and $y$ is {\it strong};
\item if $r$ is near $0$ -- meaning that $r$ is positive or negative, but near
 $0$ -- then the linear association between $x$ and $y$ is {\it weak};
\item if $r>0$ then the linear association between $x$ and $y$ is positive,
 while if $r<0$ then the linear association between $x$ and $y$ is
 negative;
\item $r$ is the same no matter what units are used for the variables $x$ and
 $y$ -- meaning that if we change the units in either variable, $r$ will
 not change;
\item $r$ is the same no matter which variable is being used as the explanatory
 and which as the response variable -- meaning that if we switch the roles
 of the $x$ and the $y$ in our dataset, $r$ will not change.
\end{enumerate}
\end{fact}
It is also nice to have some examples of correlation coefficients, such as
\vskip6mm
\begin{center}
\includegraphics[height=10cm,clip]{scatterrange.eps}
\end{center}
Many electronic tools which compute the correlation coefficient $r$ of a
dataset also report its square, $r^2$. The reason is explained in the
following
\begin{fact}\label{fact:rsquared}
If $r$ is the correlation coefficient between two variables $x$ and $y$ in some
quantitative dataset, then its square $r^2$ is the fraction (often described as
a percentage) of the variation of $y$ which is associated with variation in $x$.
\end{fact}
\begin{example}\label{eg:rsquared}
If the square of the correlation coefficient between the independent variable
{\it how many hours a week a student studies statistics} and the dependent
variable {\it how many points the student gets on the statistics final exam}
is $.64$, then 64\% of the variation in scores for that class is caused by
variation in how much the students study. The remaining 36\% of the variation
in scores is due to other random factors like whether a student was coming
down with a cold on the day of the final, or happened to sleep poorly the
night before the final because of neighbors having a party, or some other
issues different just from studying time.
\end{example}
\ \vfill
\pagebreak
\section*{Exercises}
\begin{exercise} Suppose you pick 50 random adults across the United States in
January 2017 and measure how tall they are. For each of them, you also get
accurate information about how tall their (biological) parents are. Now, using
as your individuals these 50 adults and as the two variables their heights and
the average of their parents' heights, make a sketch of what you think the
resulting scatterplot would look like. Explain why you made the choice you did
of one variable to be the explanatory and the other the response variable.
Tell what are the shape, strength, and direction you see in this scatterplot, if
it shows a deterministic or non-deterministic association, and why you think
those conclusions would be true if you were to do this exercise with real data.
Is there any time or place other than right now in the United States where you
think the data you would collect as above would result in a scatterplot that
would look fairly different in some significant way? Explain!
\end{exercise}
\begin{exercise} It actually turns out that it is not true that the more a
person works, the more they produce ... at least not always. Data on workers
in a wide variety of industries show that working more hours produces more of
that business's product for a while, but then after too many hours of work,
keeping on working makes for almost no additional production.
Describe how you might collect data to investigate this relationship, by
telling what individuals, population, sample, and variables you would use.
Then, assuming the truth of the above statement about what other research in
this area has found, make an example of a scatterplot that you think might
result from your suggested data collection.
\end{exercise}
\begin{exercise} Make a scatterplot of the dataset consisting of the following
pairs of measurements:
$$
\left\{(8,16), (9,9), (10,4), (11,1), (12,0), (13,1), (14,4), (15,9), (16,16)\right\} .
$$
You can do this quite easily by hand (there are only nine points!). Feel free
to use an electronic device to make the plot for you, if you have one you know
how to use, but copy the resulting picture into the homework you hand in, either
by hand or cut-and-paste into an electronic version.
Describe the scatterplot, telling what are the shape, strength, and direction.
What do you think would be the correlation coefficient of this dataset? As
always, explain all of your reasoning!
\end{exercise}
\ \vfill
\pagebreak
\chapter{Linear Regression}
\label{chap:LR}
Quick review of equations for lines:
Recall the equation of a line is usually in the form $y=mx+b$, where $x$ and
$y$ are variables and $m$ and $b$ are numbers. Some basic facts about lines:
\begin{itemize}
\item If you are given a number for $x$, you can plug it in to the equation
$y=mx+b$ to get a number for $y$, which together give you a point with
coordinates $(x,y)$ that is on the line.
\item $m$ is the {\it slope}\index{slope of a line}, which tells how much the
line goes up (increasing $y$) for every unit you move over to the right
(increasing $x$) -- we often say that the value of the slope is
$m=\frac{rise}{run}$\index{rise over run|see{slope of a line}}. It can be
\begin{itemize}
\item {\it positive}, if the line is tilted up,
\item {\it negative}, if the line is tilted down,
\item {\it zero}, if the line is horizontal, and
\item {\it undefined}, if the line is vertical.
\end{itemize}
\item You can calculate the slope by finding the coordinates $(x_1,y_1)$ and
$(x_2,y_2)$ of any two points on the line and then $m=\frac{y_2-y_1}{x_2-x_1}$.
\item In particular, if $x_2-x_1=1$, then $m=\frac{y_2-y_1}{1}=y_2-y_1$ -- so
if you look at how much the line goes up in each step of one unit to the right,
that number will be the slope $m$ (and if it goes {\it down}, the slope $m$ will
simply be negative). In other words, the slope answers the question ``for
each step to the right, how much does the line increase (or decrease)?''
\item $b$ is the
{\it $y$-intercept}\index{y-intercept of a line@$y$-intercept of a line}, which
tells the $y$-coordinate of the point where the line crosses the $y$-axis.
Another way of saying that is that $b$ is the $y$ value of the line when the
$x$ is $0$.
\end{itemize}
\section{The Least Squares Regression Line}\label{sec:TLSRL}
Suppose we have some bivariate quantitative data
$\{(x_1,y_1), \dots , (x_n,y_n)\}$ for which the correlation coefficient
indicates some linear association. It is natural to want to write down
explicitly the equation of the best line through the data -- the question is
what is this line. The most common meaning given to {\it best} in this
search for the line is {\it the line whose total square error is the smallest
possible.} We make this notion precise in two steps
\begin{definition}\label{def:residual}\index{residual, for data values and LSRLs}
Given a bivariate quantitative dataset
$\{(x_1,y_1), \dots , (x_n,y_n)\}$ and a candidate line $\widehat{y}=mx+b$
passing through this dataset, a {\bf residual} is the difference in
$y$-coordinates of an actual data point $(x_i,y_i)$ and the line's $y$ value at
the same $x$-coordinate. That is, if the $y$-coordinate of the line when
$x=x_i$ is $\widehat{y}_i=mx_i+b$, then the residual is the measure of error
given by $error_i=y_i-\widehat{y}_i$.
\end{definition}
Note we use the convention here and elsewhere of writing
$\widehat{y}$\index{11900@$\widehat{y}$,\ \ $y$ values on an approximating line}
for the $y$-coordinate on an approximating line, while the plain $y$ variable
is left for actual data values, like $y_i$.
Here is an example of what residuals look like
\vskip6mm
\begin{center}
\includegraphics[height=8cm,clip]{residual.eps}
\end{center}
Now we are in the position to state the
\begin{definition}\label{def:LSRL}\index{least squares regression line, LSRL}\index{LSRL, least squares regression line}
Given a bivariate quantitative dataset the {\bf least squares regression line},
almost always abbreviated to {\bf LSRL}, is the line for which the sum of the
squares of the residuals is the smallest possible.
\end{definition}
\begin{fact}\label{fact:LSRLproperties}
If a bivariate quantitative dataset $\{(x_1,y_1), \dots , (x_n,y_n)\}$ has LSRL
given by $\widehat{y}=mx+b$, then
\begin{enumerate}
\item The slope of the LSRL is given by $m=r\frac{s_y}{s_x}$, where $r$ is the
correlation coefficient of the dataset.
\item The LSRL passes through the point $(\overline{x},\overline{y})$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}.
\item It follows that the $y$-intercept of the LSRL is given by
$b=\overline{y}-\overline{x}\,m=\overline{y}-\overline{x}\,r\,\frac{s_y}{s_x}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}.
\end{enumerate}
\end{fact}
It is possible to find the (coefficients of the) LSRL using the above
information, but it is often more convenient to use a
calculator\index{calculator} or other electronic tool. Such tools also make
it very easy to graph the LSRL right on top of the scatterplot -- although it
is often fairly easy to sketch what the LSRL will likely look like by just
making a good guess, using visual intuition, if the linear association is
strong (as will be indicated by the correlation coefficient).
\begin{example}\label{eg:LSRL}
Here is some data where the individuals are 23 students in a statistics class,
the independent variable is the students' total score on their homeworks, while
the dependent variable is their final total course points, both out of 100.
$$
\begin{matrix}
x:&65&65&50&53&59&92&86&84&29\\
y:&74&71&65&60&83&90&84&88&48\\
\ \\
x:&29& 9&64&31&69&10&57&81&81\\
y:&54&25&79&58&81&29&81&94&86\\
\ \\
x:&80&70&60&62&59\\
y:&95&68&69&83&70\\
\end{matrix}
$$
Here is the resulting scatterplot, made with
{\bf LibreOffice Calc}\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]} (a free equivalent of
{\bf Microsoft Excel}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]})
\vskip6mm
\begin{center}
\includegraphics[height=8cm,clip]{scatter1.eps}
\end{center}
It seems pretty clear that there is quite a strong linear association between
these two variables, as is borne out by the correlation coefficient,
$r=.935$ (computed with {\bf LibreOffice Calc}'s {\tt CORREL}\index{CORREL, correlation coefficient in LibreOffice Calc and MS Excel@{\tt CORREL}, correlation coefficient in spreadsheets}). Using then
{\tt STDEV.S}\index{STDEV.S@{\tt STDEV.S}, sample standard deviation in spreadsheets} and {\tt AVERAGE}\index{AVERAGE@{\tt AVERAGE}, sample mean in spreadsheets}, we find that the coefficients of the LSRL for this data,
$\widehat{y}=mx+b$ are
$$
m=r\frac{s_y}{s_x}=.935\frac{18.701}{23.207}=.754\qquad{\rm and}\qquad b=\overline{y}-\overline{x}\,m=71-58\cdot .754=26.976
$$
We can also use {\bf LibreOffice Calc}\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}'s {\tt Insert Trend Line}\index{Insert Trend Line@{\tt Insert Trend Line}, display LSRL in spreadsheet scatterplots}, with {\tt Show Equation}\index{Show Equation@{\tt Show Equation}, display LSRL equation in spreadsheets}, to get all this done
automatically. Note that when {\bf LibreOffice Calc} writes the equation of
the LSRL, it uses $f(x)$ in place of $\widehat{y}$, as we would.
\vskip6mm
\begin{center}
\includegraphics[height=9cm,clip]{scatter2.eps}
\end{center}
\end{example}
\ \vfill
\pagebreak
\section{Applications and Interpretations of LSRLs}\label{sec:AaIoLSRLs}
Suppose that we have a bivariate quantitative dataset
$\{(x_1,y_1), \dots , (x_n,y_n)\}$ and we have computed its correlation
coefficient $r$ and (the coefficients of) its LSRL $\widehat{y}=mx+b$.
What is this information good for?
The main use of the LSRL is described in the following
\begin{definition}\label{def:interpolation}\index{interpolation}
Given a bivariate quantitative dataset and associated LSRL with equation
$\widehat{y}=mx+b$, the process of guessing that the value of the dependent
variable in this relationship is $mx_0+b$, for $x_0$ any
value for the independent variable which satisfies $x_{min}\le x_0\le x_{max}$,
is called {\bf interpolation}.
\end{definition}
The idea of interpolation is that we think the LSRL describes as well as
possible the relationship between the independent and dependent variables, so
that if we have a new $x$ value, we'll use the LSRL equation to predict what
would be our best guess of what would be the corresponding $y$. Note we
might have a new value of $x$ because we simply lost part of our dataset and
are trying to fill it in as best we can. Another reason might be that a new
individual came along whose value of the independent variable, $x_0$, was
typical of the rest of the dataset -- so at the very least
$x_{min}\le x_0\le x_{max}$ -- and we want to guess what will be the value of the
dependent variable for this individual before we measure it. (Or maybe we
cannot measure it for some reason.)
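A common (but naive) alternate approach to interpolation for a value $x_0$ as
above might be to find two values $x_i$ and $x_j$ in the dataset which were
as close to $x_0$ as possible, and on either side of it (so $x_i<x_0<x_j$),
and simply to guess that the $y$-value for $x_0$ is the average of $y_i$ and
$y_j$. This is less effective than interpolation with the LSRL, since the
LSRL is built from the entire dataset while the averaging approach pays
attention only to those two nearest data points.
% NOTE(review): source text appears to have been lost at this point (everything
% between a `<' and a later `>' was dropped, which likely included further
% discussion and sectioning before the definition below) -- restore it from
% the original manuscript.
\begin{definition}\label{def:extrapolation}\index{extrapolation}
Given a bivariate quantitative dataset and associated LSRL with equation
$\widehat{y}=mx+b$, the process of guessing that the value of the dependent
variable in this relationship is $mx_0+b$, for $x_0$ any value for the
independent variable which does {\it not} satisfy $x_{min}\le x_0\le x_{max}$
[so, instead, either $x_0<x_{min}$ or $x_0>x_{max}$],
is called {\bf extrapolation}.
\end{definition}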
Extrapolation is considered a bad, or at least risky, practice. The idea is
that we used the evidence in the dataset $\{(x_1,y_1), \dots , (x_n,y_n)\}$ to
build the LSRL, but, by definition, all of this data lies in the interval on
the $x$-axis from $x_{min}$ to $x_{max}$. There is literally no evidence from
this dataset about what the relationship between our chosen explanatory and
response variables will be for $x$ outside of this interval. So in the absence
of strong reasons to believe that the precise linear relationship described
by the LSRL will continue for more $x$'s, we should not assume that it does,
and therefore we should not use the LSRL equation to guess values by
extrapolation.
The fact is, however, that often the best thing we can do with available
information when we want to make predictions out into uncharted territory on
the $x$-axis is extrapolation. So while it is perilous, it is reasonable to
extrapolate, so long as you are clear about what exactly you are doing.
\begin{example}\label{eg:extrapolation}
Using again the statistics students' homework and total course points data
from Example~\ref{eg:LSRL}, suppose the course instructor wanted to predict
what would be the total course points for a student who had earned a perfect
$100$ points on their homework. Plugging into the LSRL, this would have
yielded a guess of $.754\cdot100+26.976=102.376$. Of course, this would
have been impossible, since the maximum possible total course score was $100$.
Moreover, making this guess is an example of extrapolation, since the $x$
value of $100$ is beyond the largest $x$ value of $x_{max}=92$ in the dataset.
Therefore we should not rely on this guess -- as makes sense, since it is
invalid by virtue of being larger than $100$.
\end{example}
\subsection{Simpson's Paradox}\label{ssec:SP}
Our last caution is not so much a way using the LSRL can go wrong, but
instead a warning to be ready for something very counter-intuitive to happen
-- so counter-intuitive, in fact, that it is called a paradox.
It usually seems reasonable that if some object is cut into two pieces, both
of which have a certain property, then probably the whole object also has
that same property. But if the object in question is {\it a population} and
the property is {\it has positive correlation}, then maybe the unreasonable
thing happens.
\begin{definition}\label{def:simpsonsparadox}\index{Simpson's Paradox}
Suppose we have a population for which we have a bivariate quantitative
dataset. Suppose further that the population is broken into two (or more)
subpopulations for all of which the correlation between the two variables
is {\it positive}, but the correlation of the variables for the whole dataset
is {\it negative}. Then this situation is called {\bf Simpson's Paradox}.
[It's also called Simpson's Paradox if the role of {\it positive} and
{\it negative} is reversed in our assumptions.]
\end{definition}
The bad news is that Simpson's paradox can happen.
\begin{example}\label{eg:simpsons1}
Let $\Pp=\{(0,1), (1,0), (9,10), (10,9)\}$ be a bivariate dataset, which is
broken into the two subpopulations $\Pp_1=\{(0,1), (1,0)\}$ and
$\Pp_2=\{(9,10), (10,9)\}$. Then the correlation coefficients of both $\Pp_1$
and $\Pp_2$ are $r=-1$, but the correlation of all of $\Pp$ is $r=.9756$.
This is Simpson's Paradox!
\end{example}
Or, in applications, we can have situations like
\begin{example}\label{eg:simpsons2}
Suppose we collect data on two sections of a statistics course, in particular
on how many hours per week the individual students study for the course and
how they do in the course, measured by their total course points at the end of
the semester. It is possible that there is a strong positive correlation
between these variables for each section by itself, but there is a strong
negative correlation when we put all the students into one dataset. In other
words, it is possible that the rational advice, based on both individual
sections, is {\it study more and you will do better in the course}, but that
the rational advice based on all the student data put together is
{\it study less and you will do better}.
\end{example}
\ \vfill
\pagebreak
\section*{Exercises}
\begin{exercise} The age ($x$) and resting heart rate (RHR, $y$) were measured
for nine men, yielding this dataset:
$$
\begin{matrix}
x:&20&23&30&37&35&45&51&60&63\\
y:&72&71&73&74&74&73&75&75&77
\end{matrix}
$$
Make a scatterplot of these data.
Based on the scatterplot, what do you think the correlation coefficient $r$ will
be?
Now compute $r$.
Compute the LSRL for these data, write down its equation, and sketch it on top
of your scatterplot.
{\it [You may, of course, do as much of this with electronic tools as you
like. However, you should explain what tool you are using, how you used it,
and what it must have been doing behind the scenes to get the results which it
displayed and you are turning in.]}
\end{exercise}
\begin{exercise} Continuing with the data and computations of the previous
problem:
What percentage of the variation in RHR is associated with variation in age?
Write the following sentences with blanks filled in: ``If I measured the RHR of
a 55 year-old man, I would expect it to be \underbar{\hphantom{XXXX}}.
Making an estimate like this is called \underbar{\hphantom{XXXXXXXXXXX}}.''
Just looking at the equation of the LSRL, what does it suggest should be the
RHR of a newborn baby? Explain.
Also explain what an estimate like yours for the RHR of a baby is called. This
kind of estimate is considered a bad idea in many cases -- explain why in
general, and also use specifics from this particular case.
\end{exercise}
\begin{exercise} Write down a bivariate quantitative dataset for a population
of only two individuals whose LSRL is $\widehat{y}=2x-1$.
What is the correlation coefficient of your dataset?
Next, add one more point to the dataset in such a way that you don't change the
LSRL or correlation coefficient.
Finally, can you find a dataset with the same LSRL but having a larger
correlation coefficient than you just had?
{\it [Hint: fool around with modifications or additions to the datasets
you already found in this problem, using an electronic tool to do all the
computational work. When you find a good one, write it down and explain what
your thinking was as you searched for it.]}
\end{exercise}
\part{Good Data}\label{part:GD}
It is something of an aphorism among statisticians that
\begin{quote}
\begin{center}
\textit{The plural of anecdote is not data.\footnote{It is hard to be certain
of the true origins of this phrase. The political scientist
Raymond Wolfinger is sometimes given credit \cite{pedersen2015former} --
for a version {\it without the} ``not,'' actually. Sometime later, then,
it became widespread with the ``not.''}\index{anecdote, not the singular of data}\index{data, not the plural of anecdote}}
\end{center}
\end{quote}
The distinction being emphasized here is between the information we might get
from a personal experience or a friend's funny story -- an anecdote -- and the
cold, hard, objective information on which we want to base our scientific
investigations of the world -- data.
In this Part, our goal is to discuss aspects of getting good data. It may
seem counter-intuitive, but the first step in that direction is to develop
some of the foundations of
{\it probability theory}\index{probability theory}, the mathematical study of
systems which are non-deterministic -- random -- but in a consistent way.
The reason for this is that the easiest and most reliable way to ensure
objectivity\index{objectivity} in data, to suppress personal choices which may
result in biased\index{bias} information from which we cannot draw universal,
scientific conclusions, is to collect your data {\it randomly}.
Randomness\index{randomness} is a tool which the scientist introduces
intentionally and carefully, as a barrier against bias, in the collection of
high quality data. But this strategy only works if we can understand how to
extract precise information even in the presence of randomness -- hence the
importance of studying probability theory.
After a chapter on probability, we move on to a discussion of some
fundamentals of {\it experimental design}\index{experimental design} --
starting, not surprisingly, with {\it randomization}, but finishing with the
gold standard for experiments (on humans, at least):
{\it randomized, placebo-controlled, double-blind experiments [RCTs]}\index{randomized, placebo-controlled, double-blind experiment}\index{RCT, randomized, controlled trial}. Experiments
whose subjects are not humans share some, but not all, of these design goals.
It turns out that, historically, a number of experiments with human subjects
have had very questionable moral foundations, so it is very important to stop,
as we do in the last chapter of this Part, to build an outline of
{\it experimental ethics}\index{experimental ethics}.
\chapter{Probability Theory}
\label{chap:PT}
We want to imagine doing an experiment in which there is no way to predict
what the outcome will be. Of course, if we stop our imagination there, there
would be nothing we could say and no point in trying to do any further analysis:
the outcome would just be whatever it wanted to be, with no pattern.
So let us add the additional assumption that while we {\it cannot predict} what
will happen any particular time we do the experiment, we {\it can predict}
general trends, in the long run, if we repeat the experiment many times. To be
more precise, we assume that, for any collection $E$ of possible outcomes of
the experiment there is a number $p(E)$ such that, no matter who does the
experiment, no matter when they do it, if they repeat the experiment many
times, the fraction of times they would have seen any of the outcomes of $E$
would be close to that number $p(E)$.
This is called the {\it frequentist}\index{frequentist approach to probability}
approach to the idea of probability. While it is not universally accepted
-- the {\it Bayesian}\index{Bayesian} alternative does in fact have many
adherents -- it has the virtue of being the most internally consistent way of
building a foundation for probability. For that reason, we will follow the
frequentist description of probability in this text.
Before we jump into the mathematical formalities, we should motivate two
pieces of what we just said. First, why talk about {\it sets} of outcomes of
the experiment instead of talking about individual outcomes? The answer is
that we are often interested in sets of outcomes, as we shall see later in
this book, so it is nice to set up the machinery from the very start to work
with such sets. Or, to give a particular concrete example, suppose you were
playing a game of cards and could see your hand but not the other players'
hands. You might be very interested in how likely it is that your hand is a
winning hand, {\it i.e.}, what is the likelihood of the set of all possible
configurations of all the rest of the cards in the deck and in your opponents'
hands for which what you have will be the winning hand? It is situations like
this which motivate an approach based on {\it sets} of outcomes of the
random experiment.
Another question we might ask is: where does our uncertainty about the
experimental results come from? From the beginnings of the scientific method
through the turn of the $20^{th}$ century, it was thought that this
uncertainty came from our incomplete knowledge of the system on which we were
experimenting. So if the experiment was, say, flipping a coin, the precise
amount of force used to propel the coin up into the air, the precise angular
motion imparted to the coin by its position just so on the thumbnail of the
person doing the flipping, the precise drag that the coin felt as it tumbled
through the air caused in part by eddies in the air currents coming from the
flap of a butterfly's wings\index{butterfly in the Amazon rainforest} in the
Amazon rainforest -- all of these things could significantly contribute to
changing whether the coin would eventually come up {\it heads} or {\it
tails}. Unless the coin-flipper was a robot operating in a vacuum, then,
there would just be no way to know all of these physical details with enough
accuracy to predict the toss.
After the turn of the $20^{th}$ century, matters got even worse (at least for
physical determinists): a new theory of physics came along then, called
{\it Quantum Mechanics}\index{quantum mechanics}, according to which true
randomness is built into the laws of the universe. For example, if you
have a very dim light source, which produces the absolutely smallest possible
``chunks'' of light (called {\it photons}\index{photon}), and you shine it
through first one polarizing filter and then see if it goes through a second
filter at a $45^\circ$ angle to the first, then half the photons will get through
the second filter, but there is {\it absolutely no way ever to predict whether
any particular photon will get through or not}. Quantum mechanics is full
of very weird, non-intuitive ideas, but it is one of the most well-tested
theories in the history of science, and it has passed every test.
\ \vfill
\pagebreak
\section{Definitions for Probability}\label{sec:defsforprob}
\subsection{Sample Spaces, Set Operations, and Probability Models}\label{ssec:SSSOaPMs}
Let's get right to the definitions.
\begin{definition}\label{def:outcomesamplespace}
Suppose we have a repeatable experiment we want to investigate
probabilistically. The things that happen when we do the experiment, the
results of running it, are called the [{\bf experimental}]
{\bf outcomes}\index{outcome of an experiment}. The set of all outcomes is
called the {\bf sample space}\index{sample space} of the experiment. We almost
always use the symbol $S$ for this sample space.
\end{definition}
\begin{example}\label{eg:sampspace1}
Suppose the experiment we are doing is ``flip a coin.'' Then the sample space
would be $S=\{H, T\}$.
\end{example}
\begin{example}\label{eg:sampspace2}
For the experiment ``roll a [normal, six-sided] die,'' the sample space would
be $S=\{1, 2, 3, 4, 5, 6\}$.
\end{example}
\begin{example}\label{eg:sampspace3}
For the experiment ``roll two dice,'' the sample space would be
\begin{align*}
S=\{&11, 12, 13, 14, 15, 16,\\
 &21, 22, 23, 24, 25, 26,\\
 &31, 32, 33, 34, 35, 36,\\
 &41, 42, 43, 44, 45, 46,\\
 &51, 52, 53, 54, 55, 56,\\
 &61, 62, 63, 64, 65, 66\}
\end{align*}
where the notation ``$nm$'' means ``$1^{st}$ roll resulted in an $n$, $2^{nd}$ in
an $m$.''
\end{example}
\begin{example}\label{eg:samspace4}
Consider the experiment ``flip a coin as many times as necessary to see the
first {\it Head}.'' This would have the infinite sample space
$$
S=\{H, TH, TTH, TTTH, TTTTH, \dots \} \quad .
$$
\end{example}
\begin{example}\label{eg:sampspace5}
Finally, suppose the experiment is ``point a Geiger counter at a lump of
radioactive material and see how long you have to wait until the next click.''
Then the sample space $S$ is the set of all positive real numbers, because
potentially the waiting time could be any positive amount of time.
\end{example}
As mentioned in the chapter introduction, we are more interested in
\begin{definition}\label{def:event}
Given a repeatable experiment with sample space $S$, an
{\bf event}\index{event} is any collection of [some, all, or none of the]
outcomes in $S$; {\it i.e.,} an event is any
{\bf subset}\index{subset, $\subset$} $E$ of $S$, written
$E\subset S$\index{10400@$\subset$, subset}.
\end{definition}
There is one special set which is a subset of any other set, and therefore is
an event in any sample space.
\begin{definition}\label{def:emptyset}
The set $\{\}$ with no elements is called the
{\bf empty set}\index{empty set, $\emptyset$}\index{10100@$\emptyset$, empty set}, for which we use the notation $\emptyset$.
\end{definition}
\begin{example}\label{eg:events1}
Looking at the sample space $S=\{H, T\}$ in Example~\ref{eg:sampspace1}, it's
pretty clear that the following are all the subsets of $S$:
\begin{align*}
&\emptyset\\
&\{H\}\\
&\{T\}\\
&S\ [=\{H, T\}]
\end{align*}
\end{example}
Two parts of that example are always true: $\emptyset$ and $S$ are always
subsets of any set $S$.
Since we are going to be working a lot with events, which are subsets of a
larger set, the sample space, it is nice to have a few basic terms from set
theory:
\begin{definition}\label{def:complement}
Given a subset $E\subset S$ of a larger set $S$, the
{\bf complement of $E$}\index{complement, $E^c$}\index{10500@$E^c$, complement},
is the set $E^c=\{\text{all the elements of $S$ which {\it are not} in $E$}\}$.
\end{definition}
If we describe an event $E$ in words as all outcomes which satisfy some property
$X$, the complementary event, consisting of all the outcomes not in $E$, can
be described as all outcomes which {\it don't} satisfy $X$. In other words,
we often describe the event $E^c$ as the event
``{\bf not} $E$.''\index{not, for an event}
\begin{definition}\label{def:union}
Given two sets $A$ and $B$, their
{\bf union}\index{union, $\cup$}\index{10300@$\cup$, union} is the set
$$
A\cup B = \{\text{all elements which are in $A$ or $B$ [or both]}\}\ .
$$
\end{definition}
Now if event $A$ is those outcomes having property $X$ and $B$ is those with
property $Y$, the event $A\cup B$, with all outcomes in $A$ together with all
outcomes in $B$ can be described as all outcomes satisfying $X$ or $Y$, thus
we sometimes pronounce the event ``$A\cup B$'' as
``$A$ {\bf or}\index{or, for events} $B$.''
\begin{definition}\label{def:intersection}
Given two sets $A$ and $B$, their
{\bf intersection}\index{intersection, $\cap$}\index{10200@$\cap$, intersection}
is the set
$$
A\cap B = \{\text{all elements which are in both $A$ and $B$}\}\ .
$$
\end{definition}
If, as before, event $A$ consists of those outcomes having property $X$ and
$B$ is those with property $Y$, the event $A\cap B$ will consist of those
outcomes which satisfy both $X$ and $Y$. In other words, ``$A\cap B$'' can
be described as ``$A$ {\bf and}\index{and, for events} $B$.''
Putting together the idea of intersection with the idea of that special subset
$\emptyset$ of any set, we get the
\begin{definition}\label{def:disjointME}
Two sets $A$ and $B$ are called {\bf disjoint}\index{disjoint events} if
$A\cap B=\emptyset$. In other words, sets are disjoint if they have nothing
in common.
An exact synonym for disjoint that some authors prefer is
{\bf mutually exclusive}\index{mutually exclusive events}. We will use both
terms interchangeably in this book.
\end{definition}
Now we are ready for the basic structure of probability.
\begin{definition}\label{def:probmodel}
Given a sample space $S$, a {\bf probability model}\index{probability model}
on $S$ is a choice of a real number $P(E)$ for every event $E\subset S$ which
satisfies
\begin{enumerate}
\item For all events $E$, $0\le P(E)\le 1$.
\item $P(\emptyset)=0$ and $P(S)=1$.
\item\label{item:complementrule} For all events $E$, $P(E^c)=1-P(E)$.
\item If $A$ and $B$ are any two {\it disjoint}\index{disjoint events} events, then
$P(A\cup B)=P(A)+P(B)$. [This is called the {\bf addition rule for disjoint events}\index{addition rule for disjoint events}.]
\end{enumerate}
\end{definition}
\subsection{Venn Diagrams}\label{ssec:VDs}
Venn diagrams\index{Venn diagram} are a simple way to display
subsets\index{subset, $\subset$} of a fixed set and to show the relationships
between these subsets and even the results of various set operations (like
{\it complement}\index{complement, $E^c$}, {\it union}\index{union, $\cup$},
and {\it intersection}\index{intersection, $\cap$}) on them. The primary
use we will make of Venn diagrams is for events\index{event} in a certain sample
space\index{sample space}, so we will use that terminology [even though the
technique has much wider application].
To make a Venn Diagram\index{Venn diagram}, {\it always start out by making a
rectangle to represent the whole sample space\index{sample space}}:
\begin{center}
\includegraphics[height=6cm,clip]{VennSampSpace.eps}
\end{center}
Within that rectangle, we make circles, ovals, or just blobs, to indicate that
portion of the sample space\index{sample space} which is some event $E$:
\begin{center}
\includegraphics[height=6cm,clip]{VennSampSpace1Circ.eps}
\end{center}
Sometimes, the outcomes in the sample space\index{sample space} $S$ and in
the event\index{event} $A$ might be indicated in the different parts of the
Venn diagram\index{Venn diagram}. So, if $S=\{a, b, c, d\}$ and
$A=\{a, b\}\subset S$, we might draw this as
\begin{center}
\includegraphics[height=6cm,clip]{VennSampSpace1CircWDots.eps}
\end{center}
The {\it complement}\index{complement, $E^c$} $E^c$ of an event\index{event}
$E$ is easy to show on a Venn diagram\index{Venn diagram}, since it is simply
everything which is not in $E$:
\vskip6mm
\begin{center}
\begin{tabular}{ccc}
\includegraphics[height=4cm,clip]{VennSampSpace1FilledCir.eps}
&\ \ \ \ &
\includegraphics[height=4cm,clip]{VennSamSpComp1FilledCir.eps} \\
If the filled part here is $E$ & ... & then the filled part here is $E^c$
\end{tabular}
\end{center}
This can actually be helpful in figuring out what must be in $E^c$. In the
example above with $S=\{a, b, c, d\}$ and $A=\{a, b\}\subset S$, by looking at
what is in the shaded exterior part for our picture of $E^c$, we can see that
for that $A$, we would get $A^c=\{c, d\}$.
Moving now to set operations that work with two events\index{event}, suppose
we want to make a Venn diagram\index{Venn diagram} with events $A$ and $B$.
If we know these events are disjoint\index{disjoint events}, then we would make the
diagram as follows:
\begin{center}
\includegraphics[height=6cm,clip]{VennSamSp2CirDisj.eps}
\end{center}
while if they are known not to be disjoint, we would use instead this
diagram:
\begin{center}
\includegraphics[height=6cm,clip]{VnSamSp2CirNotDsj.eps}
\end{center}
For example, if $S=\{a, b, c, d\}$, $A=\{a, b\}$, and $B=\{b, c\}$, we would
have
\begin{center}
\includegraphics[height=6cm,clip]{VnSmSp2CrNtDsjDts.eps}
\end{center}
When in doubt, it is probably best to use the version with overlap, which then
could simply not have any points in it (or could have zero probability, when
we get to that, below).
Venn diagrams\index{Venn diagram} are very good at showing
unions\index{union, $\cup$}, and intersection\index{intersection, $\cap$}:
\begin{center}
\begin{tabular}{ccc}
\includegraphics[height=4cm,clip]{VnSmSp1FilledCirL.eps}
&\ \ \ \ &
\includegraphics[height=4cm,clip]{VnSmSp1FilledCirR.eps}\\
If the filled part here is $A$ & and & the filled part here is $B$\\
\\
& then & \\
\\
\includegraphics[height=4cm,clip]{VnRacoon.eps}
&\ \ \ \ &
\includegraphics[height=4cm,clip]{VnSmSpFilledOvlp.eps} \\
the filled part here is $A\cup B$ & and & the filled part here is $A\cap B$
\end{tabular}
\end{center}
Another nice thing to do with Venn diagrams\index{Venn diagram} is to use them
as a visual aid for probability computations. The basic idea is to make a
diagram showing the various events\index{event} sitting inside the usual
rectangle, which stands for the sample space\index{sample space}, and to put
numbers in various parts of the diagram showing the probabilities of those
events, or of the results of operations (unions\index{union, $\cup$},
intersection\index{intersection, $\cap$}, and
complement\index{complement, $E^c$}) on those events.
For example, if we are told that an event\index{event} $A$ has probability
$P(A)=.4$, then we can immediately fill in the $.4$ as follows:
\begin{center}
\includegraphics[height=6cm,clip]{VennSampSpace1Circ.4.eps}
\end{center}
But we can also put a number in the exterior of that circle which represents
$A$, taking advantage of the fact that that exterior is $A^c$ and the rule for
probabilities of complements (point (\ref{item:complementrule}) in
Definition~\ref{def:probmodel}) to conclude that the appropriate number
is $1-.4=.6$:
\begin{center}
\includegraphics[height=6cm,clip]{VennSampSpace1Circ.4.6.eps}
\end{center}
We recommend that, in a Venn diagram\index{Venn diagram} showing probability
values, {\it you always put a number in the region exterior to all of the
events\index{event} [but inside the rectangle indicating the sample
space\index{sample space}, of course]}.
Complicating a little this process of putting probability numbers in the
regions of a Venn diagram\index{Venn diagram} is the situation where we are
given probabilities for both an event\index{event} and a subset\index{subset, $\subset$} of that
event. This most often happens when we are told probabilities both of some
events and of their intersection(s)\index{intersection, $\cap$}. Here is
an example:
\begin{example}\label{eg:probnumsinVDs}
Suppose we are told that we have two events $A$ and $B$ in the sample space
$S$, which satisfy $P(A)=.4$, $P(B)=.5$, and $P(A\cap B)=.1$. First of all,
we know that $A$ and $B$ are not disjoint, since if they were disjoint, that
would mean (by definition) that $A\cap B=\emptyset$, and since $P(\emptyset)=0$
but $P(A\cap B)\neq 0$, that is not possible. So we draw a Venn diagram that
we've seen before:
\begin{center}
\includegraphics[height=6cm,clip]{VnSamSp2CirNotDsj.eps}
\end{center}
However, it would be unwise simply to write those given numbers $.4$, $.5$, and
$.1$ into the three central regions of this diagram. The reason is that the
number $.1$ is the probability of $A\cap B$, which is a part of $A$ already,
so if we simply write $.4$ in the rest of $A$, we would be counting that $.1$
for the $A\cap B$ twice. Therefore, before we write a number in the rest of
$A$, outside of $A\cap B$, we have to subtract the $.1$ for $P(A\cap B)$.
That means that the number which goes in the rest of $A$ should be $.4-.1=.3$.
A similar reasoning tells us that the number in the part of $B$ outside of
$A\cap B$, should be $.5-.1=.4$. That means the Venn diagram with all
probabilities written in would be:
\begin{center}
\includegraphics[height=6cm,clip]{egVnDiagWProbs.eps}
\end{center}
\end{example}
The approach in the above example is our second important recommendation for
how to put numbers in a Venn diagram\index{Venn diagram} showing probability
values: {\it always put a number in each region which corresponds to the
probability of that smallest connected region containing the number, not any
larger region}.
One last point we should make, using the same argument as in the above
example. Suppose we have events\index{event} $A$ and $B$ in a sample
space\index{sample space} $S$ (again). Suppose we are not sure if $A$ and $B$
are disjoint\index{disjoint events}, so we cannot use the addition rule for disjoint
events to compute $P(A\cup B)$. But notice that the events $B$ and $B^c$ are
disjoint, so that $A\cap B$ and $A\cap B^c$ are also disjoint and
$$
A = A\cap S = A\cap\left(B\cup B^c\right) =
\left(A\cap B\right)\cup\left(A\cap B^c\right)
$$
is a decomposition of the event $A$ into the two disjoint events $A\cap B$ and
$A\cap B^c$. From the addition rule for disjoint events, this means that
$$
P(A)=P(A\cap B)+P(A\cap B^c)\ .
$$
Similar reasoning tells us both that
$$
P(B)=P(A\cap B)+P(A^c\cap B)
$$
and that
$$
A\cup B=\left(A\cap B^c\right)\cup\left(A\cap B\right)\cup\left(A^c\cap B\right)
$$
is a decomposition of $A\cup B$ into disjoint pieces, so that
$$
P(A\cup B)=P(A\cap B^c)+P(A\cap B)+P(A^c\cap B)\ .
$$
Combining all of these equations, we conclude that
\begin{align*}
P(A)+P(B)-P(A\cap B)
&=P(A\cap B)+P(A\cap B^c)+P(A\cap B)+P(A^c\cap B)-P(A\cap B)\\
&= P(A\cap B^c)+P(A\cap B)+P(A^c\cap B) + P(A\cap B)-P(A\cap B)\\
&= P(A\cap B^c)+P(A\cap B)+P(A^c\cap B)\\
&= P(A\cup B)
\ .
\end{align*}
This is important enough to state as a
\begin{fact}\label{fact:probnondisjevents}
{\bf The Addition Rule for General Events}\index{Addition Rule for General Events} If $A$ and $B$ are events\index{event} in a sample space\index{sample space}
$S$ then we have the addition rule for their probabilities
$$
P(A\cup B) = P(A) + P(B) - P(A\cap B)\ .
$$
This rule is true whether or not $A$ and $B$ are disjoint\index{disjoint events}.
\end{fact}
\subsection{Finite Probability Models}\label{ssec:FPMs}
Here is a nice situation in which we can calculate a lot of
probabilities fairly easily: if the sample space $S$ of some experiment is
{\it finite}.\index{finite probability models}
So let's suppose the sample space consists of just the outcomes
$S=\{o_1, o_2, \dots, o_n\}$. For each of the outcomes, we can compute the
probability:
\begin{align*}
p_1 =& P(\{o_1\})\\
p_2 =& P(\{o_2\})\\
&\vdots\\
p_n =& P(\{o_n\})\\
\end{align*}
Let's think about what the rules for probability models tell us about these
numbers $p_1, p_2, \dots, p_n$. First of all, since they are each the
probability of an event, we see that
\begin{align*}
0\le &p_1\le 1\\
0\le &p_2\le 1\\
&\ \vdots\\
0\le &p_n\le 1
\end{align*}
Furthermore, since
$S=\{o_1, o_2, \dots, o_n\}=\{o_1\}\cup\{o_2\}\cup \dots \cup\{o_n\}$ and all
of the events $\{o_1\}, \{o_2\}, \dots, \{o_n\}$ are disjoint, by the addition
rule for disjoint events we have
\begin{align*}
1=P(S)&=P(\{o_1, o_2, \dots, o_n\})\\
&=P(\{o_1\}\cup\{o_2\}\cup \dots \cup\{o_n\})\\
&=P(\{o_1\})+P(\{o_2\})+ \dots +P(\{o_n\})\\
&=p_1+p_2+ \dots +p_n\ .
\end{align*}
The final thing to notice about this situation of a finite sample space is that
if $E\subset S$ is any event, then $E$ will be just a collection of some of
the outcomes from $\{o_1, o_2, \dots, o_n\}$ (maybe none, maybe all, maybe an
intermediate number). Since, again, the events like $\{o_1\}$ and $\{o_2\}$ and
so on are disjoint, we can compute
\begin{align*}
P(E) &= P(\{\text{the outcomes $o_j$ which make up $E$}\})\\
&= \sum \{\text{the $p_j$'s for the outcomes in $E$}\}\ .
\end{align*}
In other words
\begin{fact}\label{fact:finitemodel}
A probability model on a sample space $S$ with a finite number, $n$, of
outcomes, is nothing other than a choice of real numbers $p_1, p_2, \dots, p_n$,
all in the range from $0$ to $1$ and satisfying $p_1+p_2+ \dots +p_n=1$. For
such a choice of numbers, we can compute the probability of any event
$E\subset S$ as
$$
P(E) = \sum \{\text{the $p_j$'s corresponding to the outcomes $o_j$ which make up $E$}\}\ .
$$
\end{fact}
\begin{example}\label{eg:finiteprobmod1}
For the coin flip of Example~\ref{eg:sampspace1}, there are only the two
outcomes $H$ and $T$ for which we need to pick two probabilities, call them
$p$ and $q$. In fact, since the total must be $1$, we know that $p+q=1$ or,
in other words, $q=1-p$. Then the probabilities for all events (which we
listed in Example~\ref{eg:events1}) are
\begin{align*}
P(\emptyset) &= 0\\
P(\{H\}) &= p\\
P(\{T\}) &= q = 1-p\\
P(\{H,T\}) &= p + q = 1
\end{align*}
What we've described here is, potentially, a
{\bf biased coin}\index{biased coin}\index{coin!biased}, since we are not
assuming that $p=q$ -- the probabilities of getting a head and a tail are not
assumed to be the same. The alternative is to assume that we have a
{\bf fair coin}\index{fair coin}\index{coin!fair}, meaning that $p=q$. Note
that in such a case, since $p+q=1$, we have $2p=1$ and so $p=1/2$. That is,
the probability of a head (and, likewise, the probability of a tail) in a
single throw of a fair coin is $1/2$.
\end{example}
\begin{example}\label{eg:finiteprobmod2}
As in the previous example, we can consider the die of
Example~\ref{eg:sampspace2} to be a {\bf fair die}, meaning that the individual
face probabilities are all the same. Since they must also total to $1$ (as we
saw for all finite probability models), it follows that
$$
p_1 = p_2 = p_3 = p_4 = p_5 = p_6 = 1/6 .
$$
We can then use this basic information and the formula (for $P(E)$) in
Fact~\ref{fact:finitemodel} to compute the probability of any event of interest,
such as
$$
P(\text{``roll was even''}) = P(\{2, 4, 6\}) = \frac16 + \frac16 + \frac16
= \frac36 = \frac12\ .
$$
\end{example}
We should immortalize these last two examples with a
\begin{definition}\label{def:fair}
When we are talking about dice, coins, individuals for some task, or another
small, practical, finite experiment, we use the term
{\bf fair}\index{fair, in general} to indicate that the probabilities of all
individual outcomes are equal (and therefore all equal to the number $1/n$,
where $n$ is the number of outcomes in the sample space). A more technical
term for the same idea is {\bf equiprobable}\index{equiprobable}, while a more
casual term which is often used for this in very informal settings is
``{\bf at random}''\index{at random@``at random''} (such as ``pick a card
{\it at random} from this deck'' or ``pick a random patient from the study
group to give the new treatment to...'').
\end{definition}
\begin{example}\label{eg:finiteprobmod3}
Suppose we look at the experiment of Example~\ref{eg:sampspace3} and add the
information that the two dice we are rolling are {\it fair}. This actually
isn't quite enough to figure out the probabilities, since we also have to
assure that the fair rolling of the first die doesn't in any way affect the
rolling of the second die. This is technically the requirement that the two
rolls be {\it independent}\index{independent events}, but since we won't
investigate that carefully until \S\ref{sec:condprob}, below, let us instead
here simply say that we assume the two rolls are fair and are in fact
completely uninfluenced by anything around them in the world including each
other.
What this means is that, in the long run, we would expect the first die to show
a $1$ roughly ${\frac16}^{th}$ of the time, and in the very long run, the second
die would show a $1$ roughly ${\frac16}^{th}$ of {\it those} times. This means
that the outcome of the ``roll two dice'' experiment should be $11$ with
probability $\frac{1}{36}$ -- and the same reasoning would show that all of
the outcomes have that probability. In other words, this is an equiprobable
sample space with $36$ outcomes each having probability $\frac{1}{36}$.
Which in turn enables us to compute any probability we might like, such as
\begin{align*}
P(\text{``sum of the two rolls is $4$''}) &= P(\{13, 22, 31\})\\
&= \frac{1}{36} + \frac{1}{36} + \frac{1}{36}\\
&= \frac{3}{36}\\
&= \frac{1}{12}\ .
\end{align*}
\end{example}
\ \vfill
\pagebreak
\section{Conditional Probability}\label{sec:condprob}
We have described the whole foundation of the theory of probability as coming
from {\it imperfect knowledge}\index{imperfect knowledge}, in the sense that
we don't know for sure if an event $A$ will happen any particular time we do
the experiment but we do know, in the long run, in what fraction of times $A$
will happen. Or, at least, we claim that there is some number $P(A)$ such
that after running the experiment $N$ times, out of which $n_A$ of these times
are when $A$ happened, $P(A)$ is approximately $n_A/N$ (and this ratio gets
closer and closer to $P(A)$ as $N$ gets bigger and bigger).
But what if we have {\it some} knowledge? In particular, what happens if we
know for sure that the event $B$ has happened -- will that influence our
knowledge of whether $A$ happens or not? As before, when there is randomness
involved, we cannot tell for sure if $A$ will happen, but we hope that, given
the knowledge that $B$ happened, we can make a more accurate guess about the
probability of $A$.
\begin{example}\label{eg:condprob1}
If you pick a person at random in a certain country on a particular date, you
might be able to estimate the probability that the person had a certain height
if you knew enough about the range of heights of the whole population of that
country. [In fact, below we will make estimates of this kind.] That is,
if we define the event
$$
A=\text{``the random person is taller than $1.829$ meters ($6$ feet)''}
$$
then we might estimate $P(A)$.
But consider the event
$$
B=\text{``the random person's parents were both taller than $1.829$ meters''}\ .
$$
Because there is a genetic component to height, if you know that $B$ happened,
it would change your idea of how likely it is, given that knowledge, that $A$
happened. Because genetics are not the only thing which determines a person's
height, you would not be certain that $A$ happened, even given the knowledge
of $B$.
\end{example}
Let us use the frequentist approach to derive a formula for this kind of
{\it probability of $A$ given that $B$ is known to have happened}. So think
about doing the repeatable experiment many times, say $N$ times. Out of all
those times, some times $B$ happens, say it happens $n_B$ times. Out of
{\it those} times, the ones where $B$ happened, sometimes $A$ also happened.
These are the cases where both $A$ and $B$ happened -- or, converting this to
a more mathematical description, the times that $A\cap B$ happened -- so we
will write it $n_{A\cap B}$.
We know that the probability of $A$ happening in the cases where we know for
sure that $B$ happened is approximately $n_{A\cap B}/n_B$. Let's do that
favorite trick of multiplying and dividing by the same number, so finding that
the probability in which we are interested is approximately
$$
\frac{n_{A\cap B}}{n_B} = \frac{n_{A\cap B}\cdot N}{N\cdot n_B}
= \frac{n_{A\cap B}}{N}\cdot\frac{N}{n_B}
= \frac{n_{A\cap B}}{N} \Bigg/ \frac{n_B}{N}
\approx P(A\cap B) \Big/ P(B)
$$
Which is why we make the
\begin{definition}\label{def:condprob}
The {\bf conditional probability}\index{conditional probability, $P(A\mid B)$}
{\bf of the event $A$ given}\index{given@``given,'' the known event in conditional probability}
{\bf the event $B$} is
$$
P(A|B) = \frac{P(A\cap B)}{P(B)}\ .
$$\index{10780@$P(A\mid B)$, conditional probability}
Here $P(A|B)$ is pronounced {\it the probability of $A$ given $B$}.
\end{definition}
Let's do a simple
\begin{example}\label{eg:condprob2}
Building off of Example~\ref{eg:finiteprobmod2}, note that the probability
of rolling a $2$ is $P(\{2\})=1/6$ (as is the probability of rolling any other
face -- it's a {\it fair die}). But suppose that you were told that the roll
was even, which is the event $\{2, 4, 6\}$, and asked for the
probability that the roll was a $2$ given this prior knowledge. The answer
would be
$$
P(\{2\}\mid\{2, 4, 6\})=\frac{P(\{2\}\cap\{2, 4, 6\})}{P(\{2, 4, 6\})}
=\frac{P(\{2\})}{P(\{2, 4, 6\})} = \frac{1/6}{1/2} = 1/3\ .
$$
In other words, the probability of rolling a $2$ on a fair die with no other
information is $1/6$, while the probability of rolling a $2$ given that we
rolled an even number is $1/3$. So the probability doubled with the given
information.
Sometimes the probability changes even more than merely doubling: the
probability that we rolled a $1$ with no other knowledge is $1/6$, while
the probability that we rolled a $1$ given that we rolled an even number is
$$
P(\{1\}\mid\{2, 4, 6\})=\frac{P(\{1\}\cap\{2, 4, 6\})}{P(\{2, 4, 6\})}
=\frac{P(\emptyset)}{P(\{2, 4, 6\})} = \frac{0}{1/2} = 0\ .
$$
\end{example}
But, actually, sometimes the conditional probability for some event is the
same as the unconditioned probability. In other words, sometimes knowing that
$B$ happened doesn't change our estimate of the probability of $A$ at all,
they are not really related events, at least from the point of view of
probability. This motivates the
\begin{definition}\label{def:independent}
Two events $A$ and $B$ are called {\bf independent}\index{independent events} if
$P(A\mid B)=P(A)$.
\end{definition}
Plugging the defining formula for $P(A\mid B)$ into the definition of
{\it independent}, it is easy to see that
\begin{fact}\label{fact:indepevents}
Events $A$ and $B$ are independent if and only if $P(A\cap B)=P(A)\cdot P(B)$.
\end{fact}
\begin{example}\label{eg:condprob3}
Still using the situation of Example~\ref{eg:finiteprobmod2}, we saw in
Example~\ref{eg:condprob2} that the events $\{2\}$ and $\{2, 4, 6\}$ are
not independent since
$$
P(\{2\}) = 1/6 \neq 1/3 = P(\{2\}\mid\{2, 4, 6\})
$$
nor are $\{1\}$ and $\{2, 4, 6\}$, since
$$
P(\{1\}) = 1/6 \neq 0 = P(\{1\}\mid\{2, 4, 6\})\ .
$$
However, look at the events $\{1, 2\}$ and $\{2, 4, 6\}$:
\begin{align*}
P(\{1, 2\}) = P(\{1\}) + P(\{2\}) &= 1/6 + 1/6\\
&= 1/3\\
&= \frac{1/6}{1/2}\\
&= \frac{P(\{2\})}{P(\{2, 4, 6\})}\\
&= \frac{P(\{1, 2\}\cap\{2, 4, 6\})}{P(\{2, 4, 6\})}\\
&= P(\{1, 2\}\mid\{2, 4, 6\})
\end{align*}
which means that they are independent!
\end{example}
\begin{example}\label{eg:condprob4}
We can now fully explain what was going on in Example~\ref{eg:finiteprobmod3}.
The two fair dice were supposed to be rolled in a way that the first roll
had no effect on the second -- this exactly means that the dice were rolled
{\it independently}. As we saw, this then means that each individual outcome
of sample space $S$ had probability $\frac{1}{36}$. But the first roll having
any particular value is independent of the second roll having another,
{\it e.g.}, if $A=\{11, 12, 13, 14, 15, 16\}$ is the event in that sample
space of getting a $1$ on the first roll and $B=\{14, 24, 34, 44, 54, 64\}$ is
the event of getting a $4$ on the second roll, then events $A$ and $B$ are
independent, as we check by using Fact~\ref{fact:indepevents}:
\begin{align*}
P(A\cap B) &= P(\{14\})\\
&= \frac{1}{36}\\
&= \frac16\cdot\frac16\\
&= \frac{6}{36}\cdot\frac{6}{36}\\
&=P(A)\cdot P(B)\ .
\end{align*}
On the other hand, the event ``the sum of the rolls is $4$,'' which is
$C=\{13, 22, 31\}$ as a set, {\it is not independent} of the value of the
first roll, since $P(A\cap C)=P(\{13\})=\frac{1}{36}$ but
$P(A)\cdot P(C)=\frac{6}{36}\cdot\frac{3}{36}=\frac16\cdot\frac{1}{12}=\frac{1}{72}$.
\end{example}
\ \vfill
\pagebreak
\section{Random Variables}\label{sec:RVs}
\subsection{Definition and First Examples}\label{ssec:DoRVsaFEs}
Suppose we are doing a random experiment and there is some consequence of the
result in which we are interested that can be measured by a number. The
experiment might be playing a game of chance and the result could be how
much you win or lose depending upon the outcome, or the experiment could be
which part of the driver's manual you randomly choose to study and the result
how many points you get on the driver's license test you take the next day, or
the experiment might be giving a new drug to a random patient in a medical study
and the result would be some medical measurement you make after treatment
(blood pressure, white blood cell count, whatever), {\it etc.} There is a
name for this situation in mathematics
\begin{definition}\label{def:RV}
A choice of a number for each outcome of a random experiment is called a {\bf
random variable} [{\bf RV}]\index{random variable, RV}\index{RV, random
variable}. If the values an RV takes can be counted, because they are
either finite or countably infinite\index{countably infinite}\footnote{There
are many kinds of infinity in mathematics -- in fact, an infinite number of them.
The smallest is an infinity that can be counted, like the whole numbers. But
then there are many larger infinities, describing sets that are too big even
to be counted, like the set of all real numbers.} in number, the RV is called
{\bf discrete}\index{discrete random variable}; if, instead, the RV takes on
all the values in an interval of real numbers, the RV is called
{\bf continuous}\index{continuous random variable}.
We usually use capital letters to denote RVs and the corresponding lowercase
letter to indicate a particular numerical value the RV might have, like $X$
and $x$.
\end{definition}
\begin{example}\label{eg:RV1}
Suppose we play a silly game where you pay me \$5 to play, then I flip a fair
coin\index{fair coin}\index{coin!fair} and I give you \$10 if the coin comes
up heads and \$0 if it comes up tails. Then your net winnings, which would
be +\$5 or -\$5 each time you play, are a random variable. Having only two
possible values, this RV is certainly discrete.
\end{example}
\begin{example}\label{eg:RV2}
Weather phenomena vary so much, due to such small effects -- such as the famous
butterfly flapping its wings in the Amazon rain forest causing a hurricane in
North America -- that they appear to be a random phenomenon. Therefore,
observing the temperature at some weather station is a continuous random
variable whose value can be any real number in some range like $-100$ to
$100$ (we're doing {\it science}, so we use ${}^\circ C$).
\end{example}
\begin{example}\label{eg:RV3}
Suppose we look at the ``{\it roll two fair dice independently}'' experiment
from Example~\ref{eg:condprob4} and Example~\ref{eg:finiteprobmod3}, which
was based on the probability model in Example~\ref{eg:finiteprobmod3} and
sample space in Example~\ref{eg:sampspace3}. Let us consider in this situation
the random variable $X$ whose value for some pair of dice rolls is the sum of
the two numbers showing on the dice. So, for example, $X(11)=2$, $X(12)=3$,
{\it etc.}
In fact, let's make a table of all the values of $X$:
\begin{align*}
X(11) &= 2\\
X(21) = X(12) &= 3\\
X(31) = X(22) = X(13) &=4\\
X(41) = X(32) = X(23) = X(14) &= 5\\
X(51) = X(42) = X(33) = X(24) = X(15) &= 6\\
X(61) = X(52) = X(43) = X(34) = X(25) = X(16) &= 7\\
X(62) = X(53) = X(44) = X(35) = X(26) &= 8\\
X(63) = X(54) = X(45) = X(36) &= 9\\
X(64) = X(55) = X(46) &= 10\\
X(65) = X(56) &= 11\\
X(66) &= 12\\
\end{align*}
\end{example}
\subsection{Distributions for Discrete RVs}\label{ssec:D4DRVs}
The first thing we do with a random variable, usually, is talk about the
probabilities associated with it.
\begin{definition}\label{def:RVdistribution}
Given a discrete RV $X$, its {\bf distribution}\index{distribution} is a
list of all of the values $X$ takes on, together with the probability of it
taking that value.
\end{definition}
[Note this is quite similar to Definition~\ref{def:distribution} -- because it
is essentially the same thing.]
\begin{example}\label{eg:distribution1}
Let's look at the RV, which we will call $X$, in the silly betting game of
Example~\ref{eg:RV1}. As we noticed when we first defined that game, there
are two possible values for this RV, \$5 and -\$5. We can actually think of
``$X=5$'' as describing an event, consisting of the set of all outcomes of the
coin-flipping experiment which give you a net gain of \$5. Likewise,
``$X=-5$'' describes the event consisting of the set of all outcomes which give
you a net gain of -\$5. These events are as follows:
\begin{center}
\begin{tabular}{ r | c}
$x$\ \ &\ $\begin{matrix}\text{Set of outcomes $o$}\\\text{such that $X(o)=x$}\end{matrix}$\\
\hline
$5$\ \ & $\{H\}$\\
$-5$\ \ & $\{T\}$\\
\end{tabular}
\end{center}
Since it is a fair coin\index{fair coin}\index{coin!fair}, the probabilities
of these events are known (and very simple), we conclude that the distribution
of this RV is the table
\begin{center}
\begin{tabular}{ r | c}
$x$\ \ \ &\ $P(X=x)$\\
\hline
$5$\ \ & $1/2$\\
$-5$\ \ & $1/2$
\end{tabular}
\end{center}
\end{example}
\begin{example}\label{eg:distribution2}
What about the $X=\text{``{\it sum of the face values}''}$ RV on the
``{\it roll two fair dice, independently}'' random experiment from
Example~\ref{eg:RV3}? We have actually already done most of the work,
finding out what values the RV can take and which outcomes cause each of those
values. To summarize what we found:
\begin{center}
\begin{tabular}{ r | l}
$x$\ \ &\ $\begin{matrix}\text{Set of outcomes $o$}\\\text{such that $X(o)=x$}\end{matrix}$\\
\hline
$2$\ \ & $\{11\}$\\
$3$\ \ & $\{21, 12\}$\\
$4$\ \ & $\{31, 22, 13\}$\\
$5$\ \ & $\{41, 32, 23, 14\}$\\
$6$\ \ & $\{51, 42, 33, 24, 15\}$\\
$7$\ \ & $\{61, 52, 43, 34, 25, 16\}$\\
$8$\ \ & $\{62, 53, 44, 35, 26\}$\\
$9$\ \ & $\{63, 54, 45, 36\}$\\
$10$\ \ & $\{64, 55, 46\}$\\
$11$\ \ & $\{65, 56\}$\\
$12$\ \ & $\{66\}$\\
\end{tabular}
\end{center}
But we have seen that this is an equiprobable situation, where the probability
of any event $A$ containing $n$ outcomes is $P(A)=n\cdot1/36$, so we can
instantly fill in the distribution table for this RV as
\begin{center}
\begin{tabular}{ r | l}
$x$\ \ &\ $P(X=x)$\\
\hline
$2$\ \ & $\frac{1}{36}$\\
$3$\ \ & $\frac{2}{36}=\frac{1}{18}$\\
$4$\ \ & $\frac{3}{36}=\frac{1}{12}$\\
$5$\ \ & $\frac{4}{36}=\frac{1}{9}$\\
$6$\ \ & $\frac{5}{36}$\\
$7$\ \ & $\frac{6}{36}=\frac{1}{6}$\\
$8$\ \ & $\frac{5}{36}$\\
$9$\ \ & $\frac{4}{36}=\frac{1}{9}$\\
$10$\ \ & $\frac{3}{36}=\frac{1}{12}$\\
$11$\ \ & $\frac{2}{36}=\frac{1}{18}$\\
$12$\ \ & $\frac{1}{36}$\\
\end{tabular}
\end{center}
\end{example}
One thing to notice about distributions is that if we make a preliminary table,
as we just did, of the events consisting of all outcomes which give a
particular value when plugged into the RV, then we will have a collection of
disjoint events which exhausts all of the sample space. What this means is
that the sum of the probability values in the distribution table of an RV is
the probability of the whole sample space of that RV's experiment. Therefore
\begin{fact}\label{fact:distssum2one}
The sum of the probabilities in a distribution table for a random variable
must always equal $1$.
\end{fact}
It is quite a good idea, whenever you write down a distribution, to check that
this Fact is true in your distribution table, simply as a sanity check against
simple arithmetic errors.
\subsection{Expectation for Discrete RVs}\label{ssec:expectation4DRVs}
Since we cannot predict what exactly will be the outcome each time we perform
a random experiment, we cannot predict with precision what will be the value
of an RV on that experiment, each time. But, as we did with the basic idea of
probability, maybe we can at least learn something from the long-term trends.
It turns out that it is relatively easy to figure out the mean value of an RV
over a large number of runs of the experiment.
Say $X$ is a discrete RV, for which the distribution tells us that $X$ takes
the values $x_1, \dots, x_n$, each with corresponding probability
$p_1, \dots, p_n$. Then the frequentist view of probability says that the
probability $p_i$ that $X=x_i$ is (approximately) $n_i/N$, where $n_i$ is the
number of times $X=x_i$ out of a large number $N$ of runs of the experiment.
But if
$$
p_i = n_i/N
$$
then, multiplying both sides by $N$,
$$
n_i = p_i\,N \ .
$$
That means that, out of the $N$ runs of the experiment, $X$ will have the value
$x_1$ in $p_1\,N$ runs, the value $x_2$ in $p_2\,N$ runs, {\it etc.} So the
sum of $X$ over those $N$ runs will be
$$
(p_1\,N)x_1+(p_2\,N)x_2 + \dots + (p_n\,N)x_n\ .
$$
Therefore the mean value of $X$ over these $N$ runs will be the total divided
by $N$, which is $p_1\,x_1 + \dots + p_n x_n$. This motivates the definition
\begin{definition}\label{def:expectation}
Given a discrete RV $X$ which takes on the values $x_1, \dots, x_n$ with
probabilities $p_1, \dots, p_n$, the {\bf expectation}\index{expectation}
[sometimes also called the {\bf expected value}\index{expected value}] of $X$
is the value
$$
E(X) = \sum p_i\,x_i\ .
$$
\end{definition}
By what we saw just before this definition, we have the following
\begin{fact}\label{fact:RVmean}
The expectation of a discrete RV is the mean of its values over many runs of
the experiment.
\end{fact}
{\it Note:} The attentive reader will have noticed that we dealt above only
with the case of a finite RV, not the case of a countably infinite one. It
turns out that all of the above works quite well in that more complex case as
well, so long as one is comfortable with a bit of mathematical technology
called ``{\it summing an infinite series}.'' We do not assume such a comfort
level in our readers at this time, so we shall pass over the details of
expectations of infinite, discrete RVs.
\begin{example}\label{eg:expect1}
Let's compute the expectation of net profit RV $X$ in the silly betting game of
Example~\ref{eg:RV1}, whose distribution we computed in
Example~\ref{eg:distribution1}. Plugging straight into the definition, we
see
$$
E(X)=\sum p_i\,x_i = \frac12\cdot5 + \frac12\cdot(-5)=2.5-2.5 = 0 \ .
$$
In other words, your average net gain playing this silly game many times will
be {\bf zero}. Note that this does not mean anything like ``{\it if you lose
enough times in a row, the chances of starting to win again will go up},'' as
many gamblers seem to believe; it just means that, in the very long run, we
can expect the average winnings to be approximately zero -- but no one knows
how long that run has to be before the balancing of wins and losses
happens\footnote{In fact, in a very precise sense which we will not discuss in
this book, the longer you play a game like this, the more you can expect there
will be short-term, but very large, wins and losses.}.
\end{example}
A more interesting example is
\begin{example}\label{eg:expect2}
In Example~\ref{eg:distribution2} we computed the distribution of the random
variable $X=\text{``{\it sum of the face values}''}$ on the
``{\it roll two fair dice, independently}'' random experiment from
Example~\ref{eg:RV3}. It is therefore easy to plug the values of the
probabilities and RV values from the distribution table into the formula for
expectation, to get
\begin{align*}
E(X) &=\sum p_i\,x_i\\
&= \frac1{36}\cdot2 + \frac2{36}\cdot3 + \frac3{36}\cdot4
+ \frac4{36}\cdot5 + \frac5{36}\cdot6 + \frac6{36}\cdot7 + \frac5{36}\cdot8
+ \frac4{36}\cdot9 + \frac3{36}\cdot10\\
&\hphantom{= \frac1{36}\cdot2 + \frac2{36}\cdot3 + \frac3{36}\cdot4 + \frac4{36}\cdot5 + \frac5{36}\cdot6 + \frac6{36}\cdot7 + \frac5{36}\cdot8\ } +
\frac2{36}\cdot11 + \frac1{36}\cdot12\\
&= \frac{2\cdot1 + 3\cdot2 + 4\cdot3 + 5\cdot4 + 6\cdot5 + 7\cdot6 + 8\cdot5
+ 9\cdot4 + 10\cdot3 + 11\cdot2 + 12\cdot1}{36}\\
&= 7
\end{align*}
So if you roll two fair dice independently and add the numbers which come up,
then do this process many times and take the average, in the long run that
average will be the value $7$.
\end{example}
\subsection{Density Functions for Continuous RVs}\label{ssec:DF4CRVs}
What about continuous random variables? Definition~\ref{def:RVdistribution} of
{\it distribution}\index{distribution} explicitly excluded the case of
continuous RVs, so does that mean we cannot do probability calculations in that
case?
There is, when we think about it, something of a problem here. A distribution
is supposed to be a list of possible values of the RV and the probability of
each such value. But if some continuous RV has values which are an interval
of real numbers, there is just no way to list all such numbers -- it has been
known since the late 1800s that there is no way to make a list like that (see
\cite{wikiCantorsDiagonalArgument}, for a description of a very pretty proof
of this fact). In addition, the chance of some random process producing a real
number that is {\it exactly} equal to some particular value really is zero:
for two real numbers to be precisely equal requires infinite accuracy ... think
of all of those decimal digits, marching off in orderly rows to infinity, which
must match between the two numbers.
Rather than a distribution, we do the following:
\begin{definition}\label{def:densitycurve}
Let $X$ be a continuous random variable whose values are the real interval
$[x_{min},x_{max}]$, where $x_{min}$ may be $-\infty$ and/or $x_{max}$ may be $\infty$.
A [{\bf probability}\index{probability density function, for a continuous random variable}]
{\bf density function}\index{density function, for a continuous random variable} for $X$ is a function $f(x)$
defined for $x$ in $[x_{min},x_{max}]$, meaning it is a curve with one $y$ value
for each $x$ in that interval, with the property that
$$
P(a72$, since in inches, $6$ feet becomes $72$. As $X$ is a continuous RV,
we must find the area under its density curve, which is the $\rho$ for
$N(69, 2.8)$, between $72$ and $\infty$.
That $\infty$ is a little intimidating, but since the tails of the Normal
distribution are very thin, we can stop measuring area when $x$ is some large
number and we will have missed only a very tiny amount of area, so we will have
a very good approximation. Let's therefore find the area under $\rho$ from
$x=72$ up to $x=1000$. This can be done in many ways:
\begin{itemize}
\item With a wide array of online tools -- just search for ``online normal
probability calculator.'' One of these yields the value $.142$.
\item With a {\bf TI-8x} calculator\index{calculator}, by typing
$$
\text{\bf normalcdf(72, 1000, 69, 2.8)}\index{normalcdf@{\bf normalcdf}, the cumulative Normal distribution on a {\bf TI-8x} calculator}
$$
which yields the value $.1419884174$. The general syntax here is
$$
\text{\bf normalcdf($a$, $b$, $\muX$, $\sigmaX$)}
$$
to find $P(a72)$ where $X$ was
known to be $N(69, 2.8)$. Is 72 one of the numbers for which we should be
looking, to use the Rule? Well, it's greater than $\muX=69$, so we could hope
that it was $\muX+\sigmaX$, $\muX+2\sigmaX$, or $\muX+3\sigmaX$. But those values are
\begin{align*}
\muX+\sigmaX&=69+2.8=71.8,\\
\muX+2\sigmaX&=69+5.6=74.6, \text{ and}\\
\muX+3\sigmaX&=69+8.4=77.4,
\end{align*}
none of which is what we need.
Well, it is true that $72\approx71.8$, so we could use that fact and accept
that we are only getting an approximate answer -- an odd choice, given the
availability of tools which will give us extremely precise answers, but let's
just go with it for a minute.
Let's see, the above Rule tells us that
$$
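P(66.2<X<71.8) = P(\muX-\sigmaX<X<\muX+\sigmaX) = .68\ .
$$
Since the total probability is $1$, the complementary event has probability
$$
P(X<66.2\text{\ or\ }X>71.8) = 1 - P(66.2<X<71.8) = 1 - .68 = .32\ .
$$
Since the event ``$X<66.2$'' is disjoint from the event ``$X>71.8$'' ($X$ only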
takes on one value at a time, so it cannot be simultaneously less than 66.2 and
greater than 71.8), we can use the simple rule for addition of probabilities:
$$
.32 = P(X<66.2\text{\ or\ }X>71.8) = P(X<66.2) + P(X>71.8)\ .
$$
Now, since the density function of the Normal distribution is symmetric
around the line $x=\muX$, the two terms on the right in the above equation are
equal, which means that
$$
P(X>71.8) = \frac12\left(P(X<66.2) + P(X>71.8)\right) = \frac12\,(.32) = .16\ .
$$
It might help to visualize the symmetry here as the equality of the two shaded
areas in the following graph
\begin{center}
\includegraphics[height=6.2cm,clip]{use68_95_99.7-1pic.eps}
\end{center}
Now, using the fact that $72\approx71.8$, we may say that
$$
P(X>72)\approx P(X>71.8) = .16
$$
which, since we know that in fact $P(X>72)=.1419883859$, is not a completely
terrible approximation.
\end{example}
\begin{example}\label{eg:use68_95_99.7-2}
Let's do one more computation in the context of the heights of adult American
males, as in the immediately above Example~\ref{eg:use68_95_99.7-1}, but now
one in which the 68-95-99.7 Rule\index{68-95-99.7 Rule} gives a more precise answer.
So say we are asked this time what proportion of adult American men are shorter
than 63.4 inches. Why that height, in particular? Well, it's how tall
archaeologists have determined King Tut was in life. [No, that's made up. It's
just a good number for this problem.]
Again, looking through the values $\muX\pm\sigmaX$, $\muX\pm2\sigmaX$, and
$\muX\pm3\sigmaX$, we notice that
$$
63.4=69-5.6=\muX-2\sigmaX\ .
$$
Therefore, to answer what fraction of adult American males are shorter than
63.4 inches amounts to asking what is the value of $P(X<\muX-2\sigmaX)$.
What we know about $\muX\pm2\sigmaX$ is that the probability of $X$ being between
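those two values is $P(\muX-2\sigmaX<X<\muX+2\sigmaX)=.95$. The complement of
that event, which therefore has probability $.05$, consists of the two disjoint
events ``$X<\muX-2\sigmaX$'' and ``$X>\muX+2\sigmaX$,'' which have the same area by symmetry. Therefore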
\begin{align*}
P(X<63.4)&=P(X<\muX-2\sigmaX)\\
&=\frac12\left[P(X<\muX-2\sigmaX)+P(X>\muX+2\sigmaX)\right]\\
&=\frac12P(X<\muX-2\sigmaX\text{\ or\ }X>\muX+2\sigmaX)\text{\ \ \ since they're disjoint}\\
&=\frac12P((\muX-2\sigmaX \mu_0\ ,\ \text{or}\\
H_a: \muX &\neq \mu_0\ ,\\
\end{align*}
where $\mu_0$ is the same specific number as in $H_0$.
\item Gather data from an SRS\index{simple random sample, SRS}\index{SRS, simple random sample} and compute the sample statistic which is best
related to the parameter of interest. {\it For us in this section, that will
always be the sample mean $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}.}
\item Compute the following conditional probability
$$
p=P\left(
\begin{matrix}
\text{getting values of the statistic which are as extreme,}\\
\text{or more extreme, as the ones you did get}
\end{matrix}\ \middle|\ H_0\right)\ .
$$
This is called the {\bf $p$-value of the test}\index{p-value@$p$-value of a hypothesis test}.
\item If the $p$-value is sufficiently small -- typically, $p<.05$ or even
$p<.01$ -- announce
\begin{center}
{\it``We reject $H_0$, with $p=\left<\text{number here}\right>$.''}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$}\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}
\end{center}
Otherwise, announce
\begin{center}
{\it``We fail to reject $H_0$, with $p=\left<\text{number here}\right>$.''}\index{We fail to reject the null hypothesis H0@''We fail to reject the null hypothesis $H_0$.''}\index{failure to reject $H_0$}\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}
\end{center}
\item Translate the result just announced into the language of the original
question. As you do this, you can say {\it ``There is strong statistical
evidence\index{strong statistical evidence} that ...''} if the $p$-value is
very small, while you should merely say something like
{\it ``There is evidence that...''} if the $p$-value is small but not
particularly so.
\end{enumerate}
Note that the hypotheses $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} and $H_a$\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} are {\it statements}, not numbers. So
\textit{\textbf{don't} write something like} $H_0=\muX=17$; you might use
$$
H_0=\text{``$\muX=17$''}
$$
or
$$
H_0: \muX=17
$$
(we always use the latter in this book).
\subsection{How Small is Small Enough, for $p$-values?}\label{ssec:HSiSEfpvs}
Remember how the $p$-value\index{p-value@$p$-value of a hypothesis test} is
defined:
$$
p=P\left(
\begin{matrix}
\text{getting values of the statistic which are as extreme,}\\
\text{or more extreme, as the ones you did get}
\end{matrix}\ \middle|\ H_0\right)\ .
$$
In other words, if the null hypothesis\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} is true, maybe the behavior we saw with
the sample\index{sample} data would sometimes happen, but if the probability
is very small, it starts to seem that, under the assumption $H_0$ is true,
the sample behavior was a crazy fluke. If the fluke is crazy enough, we
might want simply to say that since the sample behavior actually happened, it
makes us doubt that $H_0$ is true at all.
For example, if $p=.5$, that means that under the assumption $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} is true,
we would see behavior like that of the sample about every other time we take
an SRS\index{simple random sample, SRS}\index{SRS, simple random sample} and compute the sample statistic. Not much of a surprise.
If $p=.25$, that would still be behavior we would expect to see in about
one out of every four SRSs\index{simple random sample, SRS}\index{SRS, simple random sample}, when $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} is true.
When $p$ gets down to $.1$, that is still behavior we expect to see about
one time in ten, when $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} is true. That's rare, but we wouldn't want to bet
anything important on it.
Across science, in legal matters, and definitely for medical studies, we
start to reject $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$} when $p<.05$. After all, if $p<.05$ and $H_0$
is true, then we would expect to see results as extreme as the ones we saw in
fewer than one SRS\index{simple random sample, SRS}\index{SRS, simple random sample} out of 20.
There is some terminology for these various cut-offs.
\begin{definition}\label{def:siglevel}
When we are doing a hypothesis test and get a $p$-value which satisfies
$p<\alpha$, for some real number $\alpha$, we say the data are
{\bf statistically significant at level $\alpha$}\index{statistically significant, for data in a hypothesis test}. Here the value $\alpha$ is called the
{\bf significance level}\index{significance level} of the test, as in the
phrase {\it ``We reject $H_0$ at significance level $\alpha$,''}\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$} which we would
say if $p<\alpha$.
\end{definition}
\begin{example}\label{eg:rejectionandfailure}
If we did a hypothesis test and got a $p$-value of $p=.06$, we would say about
it that the result was statistically significant\index{statistically significant, for data in a hypothesis test} at the $\alpha=.1$ level, but not
statistically significant at the $\alpha=.05$ level. In other words, we
would say {\it ``We reject the null hypothesis at the $\alpha=.1$ level,''}\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$} but also
{\it ``We fail to reject the null hypothesis at the $\alpha=.05$ level.''}\index{We fail to reject the null hypothesis H0@''We fail to reject the null hypothesis $H_0$.''}\index{failure to reject $H_0$}
\end{example}
\begin{fact}\label{fact:defaultsiglevel}
The courts in the United States, as well as the majority of standard scientific
and medical tests which do a formal hypothesis test, use the significance
level\index{significance level} of $\alpha=.05$.
In this chapter, when not otherwise specified, we will use that value of
$\alpha=.05$ as a default significance level\index{significance level}\index{default significance level}\index{significance level!default}.
\end{fact}
\begin{example}\label{eg:HT1}
We have said repeatedly in this book that the heights of American males are
distributed like $N(69, 2.8)$. Last semester, a statistics student named
Mohammad Wong said he thought that had to be wrong, and decided to do a study
of the question. MW is a bit shorter than 69 inches, so his conjecture was
that the mean height must be less, also. He measured the heights of all of
the men in his statistics class, and was surprised to find that the average
of those 16 men's heights was 68 inches (he's only 67 inches tall, and he
thought he was typical, at least for his class\footnote{When an experimenter
tends to look for information which supports their prior ideas, it's called
{\bf confirmation bias}\index{confirmation bias} -- MW may have been
experiencing a bit of this bias when he mistakenly thought he was average in
height for his class.}). Does this support his conjecture or not?
Let's do the formal hypothesis test.
The population that makes sense for this study would be all adult American
men today -- MW isn't sure if the claim of American men's heights having a
population mean\index{population mean, $\muX$}\index{mean!population}\index{10750@$\muX$, population mean} of 69 inches was {\it always} wrong; he is just
convinced that it is wrong {\it today}.
The quantitative
variable\index{quantitative variable}\index{variable!quantitative} of interest
on that population is their height, which we'll call $X$.
The parameter of interest is the population mean
$\muX$\index{population mean, $\muX$}\index{mean!population}\index{10750@$\muX$, population mean}.
The two hypotheses then are
\begin{align*}
H_0: \muX &= 69\quad \text{and}\\
H_a: \muX &< 69\ ,
\end{align*}
where the basic idea in the null hypothesis\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} is that the claim
in this book of men's heights having mean 69 is true, while the new idea which
MW hopes to find evidence for, encoded in the alternative
hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis}, is that the true mean of
today's men's heights is less than 69 inches (like him).
MW now has to make two bad assumptions: the first is that the 16 students in his
class are an SRS\index{simple random sample, SRS}\index{SRS, simple random sample} drawn from the population of interest; the second, that the population
standard deviation\index{11400@$\sigmaX$, population standard deviation}\index{standard deviation}\index{population standard deviation, $\sigmaX$} of the
heights of individuals in his population of interest is the same as the
population standard deviation of the group of all adult American males asserted
elsewhere in this book, 2.8\,. These are definitely {\bf bad assumptions} --
particularly that MW's male classmates are an SRS of the population of today's
adult American males -- but he has to make them nevertheless in order to
get somewhere.
The sample mean\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} height $\overline{X}$ for MW's SRS\index{simple random sample, SRS}\index{SRS, simple random sample} of size
$n=16$ is $\overline{X}=68$.
MW can now calculate the $p$-value of this test, using the Central Limit
Theorem\index{Central Limit Theorem, CLT}\index{CLT, Central Limit Theorem}.
According to the CLT, the distribution of $\overline{X}$ is
$N(69, 2.8/\sqrt{16})$. Therefore the $p$-value is
$$
p=P\left(
\begin{matrix}
\text{MW would get values of $\overline{X}$ which are as}\\
\text{extreme, or more extreme, as the ones he did get}
\end{matrix}\ \middle|\ H_0\right) = P(\overline{X}<68)\ .
$$
By what we just observed the CLT tells us, this probability is computable by
$$
\text{\bf normalcdf}(-9999, 68, 69, 2.8/\sqrt{16})\index{normalcdf@{\bf normalcdf}, the cumulative Normal distribution on a {\bf TI-8x} calculator}
$$
on a calculator\index{calculator}, or
$$
\text{\tt NORM.DIST(68, 69, 2.8/SQRT(16), 1)}\index{NORM.DIST@{\tt NORM.DIST}, the cumulative Normal distribution in spreadsheets}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}
$$
in a spreadsheet\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}, either of which gives a value around .07656\,.
This means that if MW uses the 5\% significance
level\index{significance level}, as we often
do\index{significance level!default}, the result is not statistically
significant\index{statistically significant, for data in a hypothesis test}.
Only at the much cruder 10\% significance level would MW say that he rejects the null hypothesis\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$}.
In other words, he might conclude his project by saying
\begin{quote}
\hskip-2.5mm{\it ``My research collected data about my conjecture which was statistically
insignificant at the 5\% significance level but the data, significant
at the weaker 10\% level, did indicate that the average height of American men
is less than the 69 inches we were told it is ($p=.07656$).''}
\end{quote}
People who talk to MW about his study should have additional concerns about
his assumptions of having an SRS\index{simple random sample, SRS}\index{SRS, simple random sample} and of the value of the population
standard deviation\index{11400@$\sigmaX$, population standard deviation}\index{standard deviation}\index{population standard deviation, $\sigmaX$}.
\end{example}
\subsection{Calculations for Hypothesis Testing of Population Means}
\label{ssec:CfHToPMs}
We put together the ideas in \S\ref{ssec:tFSoHT} above and the conclusions of
the Central Limit
Theorem\index{Central Limit Theorem, CLT}\index{CLT, Central Limit Theorem}
to summarize what computations are necessary to perform:
\begin{fact}\label{fact:computationsHT}
Suppose we are doing a formal hypothesis test\index{hypothesis test} with
variable $X$ and parameter of interest the population mean $\muX$\index{population mean, $\muX$}\index{mean!population}\index{10750@$\muX$, population mean}.
Suppose that somehow we know the population standard
deviation\index{11400@$\sigmaX$, population standard deviation}\index{standard deviation}\index{population standard deviation, $\sigmaX$} $\sigma_X$ of $X$.
Suppose the null hypothesis\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis} is
$$
H_0: \muX = \mu_0
$$
where $\mu_0$ is a specific number. Suppose also that we have an SRS\index{simple random sample, SRS}\index{SRS, simple random sample} of size
$n$ which yielded the sample mean\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} $\overline{X}$. Then exactly
one of the following three situations will apply:
\begin{enumerate}
\item If the alternative hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} is
$H_a:\muX<\mu_0$ then the $p$-value of the test can be calculated in any of
the following ways
\begin{enumerate}
\item the area to the left of $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} under the graph of a $N(\mu_0, \sigma_X/\sqrt{n})$ distribution,
\item {\bf normalcdf}$(-9999, \overline{X}, \mu_0, \sigma_X/\sqrt{n})$\index{normalcdf@{\bf normalcdf}, the cumulative Normal distribution on a {\bf TI-8x} calculator} on a calculator\index{calculator}, or
\item {\tt NORM.DIST($\overline{X}$, $\mu_0$, $\sigma_X$/SQRT($n$), 1)}\index{NORM.DIST@{\tt NORM.DIST}, the cumulative Normal distribution in spreadsheets}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]} on a spreadsheet\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}.
\end{enumerate}
\item If the alternative hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} is
$H_a:\muX>\mu_0$ then the $p$-value of the test can be calculated in any of the
following ways
\begin{enumerate}
\item the area to the right of $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} under the graph of a $N(\mu_0, \sigma_X/\sqrt{n})$ distribution,
\item {\bf normalcdf}$(\overline{X}, 9999, \mu_0, \sigma_X/\sqrt{n})$\index{normalcdf@{\bf normalcdf}, the cumulative Normal distribution on a {\bf TI-8x} calculator} on a calculator\index{calculator}, or
\item {\tt 1-NORM.DIST($\overline{X}$, $\mu_0$, $\sigma_X$/SQRT($n$), 1)}\index{NORM.DIST@{\tt NORM.DIST}, the cumulative Normal distribution in spreadsheets}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]} on a spreadsheet\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}.
\end{enumerate}
\item If the alternative hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} is
$H_a:\muX\neq \mu_0$ then the $p$-value of the test can be found by using
the approach in exactly one of the following three situations:
\begin{enumerate}
\item If $\overline{X}<\mu_0$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} then $p$ is calculated by any of
the following three ways:
\begin{enumerate}
\item two times the area to the left of $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} under the graph of a
$N(\mu_0, \sigma_X/\sqrt{n})$ distribution,
\item 2\,*\,{\bf normalcdf}$(-9999, \overline{X}, \mu_0, \sigma_X/\sqrt{n})$\index{normalcdf@{\bf normalcdf}, the cumulative Normal distribution on a {\bf TI-8x} calculator} on a calculator\index{calculator}, or
\item {\tt 2\,*\,NORM.DIST($\overline{X}$, $\mu_0$, $\sigma_X$/SQRT($n$), 1)}\index{NORM.DIST@{\tt NORM.DIST}, the cumulative Normal distribution in spreadsheets}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]} on a spreadsheet\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}.
\end{enumerate}
\item If $\overline{X}>\mu_0$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} then $p$ is calculated by any of
the following three ways:
\begin{enumerate}
\item two times the area to the right of $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} under the graph of a
$N(\mu_0, \sigma_X/\sqrt{n})$ distribution,
\item 2\,*\,{\bf normalcdf}$(\overline{X}, 9999, \mu_0, \sigma_X/\sqrt{n})$\index{normalcdf@{\bf normalcdf}, the cumulative Normal distribution on a {\bf TI-8x} calculator} on a calculator\index{calculator}, or
\item {\tt 2\,*\,(1-NORM.DIST($\overline{X}$, $\mu_0$, $\sigma_X$/SQRT($n$), 1))}\index{NORM.DIST@{\tt NORM.DIST}, the cumulative Normal distribution in spreadsheets}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]} on a spreadsheet\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}.
\end{enumerate}
\item If $\overline{X}=\mu_0$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} then $p=1$.
\end{enumerate}
\end{enumerate}
\end{fact}
Note the reason that there is that multiplication by two if the alternative
hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} is $H_a:\muX\neq \mu_0$ is that
there are two directions -- the distribution has two tails -- in which the
values can be more extreme than $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}. For this reason we have
the following terminology:
\begin{definition}\label{def:1and2tailedtests}
If we are doing a hypothesis test\index{hypothesis test} and the alternative
hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} is $H_a:\muX>\mu_0$ or
$H_a:\muX<\mu_0$ then this is called a
{\bf one-tailed test}\index{one-tailed test}. If, instead, the alternative
hypothesis is $H_a:\muX\neq\mu_0$ then this is called a
{\bf two-tailed test}\index{two-tailed test}.
\end{definition}
\begin{example}\label{eg:HT2}
Let's do one very straightforward example of a hypothesis
test\index{hypothesis test}:
A cosmetics company fills its best-selling 8-ounce jars of facial cream by an
automatic dispensing machine. The machine is set to dispense a mean\index{mean}
of 8.1 ounces per jar. Uncontrollable factors in the process can shift the
mean away from 8.1 and cause either underfill or overfill, both of which are
undesirable. In such a case the dispensing machine is stopped and
recalibrated. Regardless of the mean amount dispensed, the standard
deviation\index{standard deviation} of the amount dispensed always has value
.22 ounce. A quality control engineer randomly selects 30 jars from the
assembly line each day to check the amounts filled. One day, the sample
mean\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} is $\overline{X}=8.2$ ounces. Let us see if there is
sufficient evidence in this sample\index{sample} to indicate, at the 1\%
level of significance\index{significance level}, that the machine should be
recalibrated.
The population\index{population of a statistical study} under study is all of
the jars of facial cream on the day of the 8.2 ounce sample.
The variable\index{variable} of interest is the weight $X$ of the jar in ounces.
The population
parameter\index{population parameter}\index{parameter, population} of interest
is the population mean $\muX$\index{population mean, $\muX$}\index{mean!population}\index{10750@$\muX$, population mean} of $X$.
The two hypotheses then are
\begin{align*}
H_0: \muX &= 8.1\quad \text{and}\\
H_a: \muX &\neq 8.1\ .
\end{align*}
The sample mean\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean} is $\overline{X}=8.2$\,, and the sample -- which
we must assume to be an SRS\index{simple random sample, SRS}\index{SRS, simple random sample} -- is of size $n=30$.
Using the case in Fact~\ref{fact:computationsHT} where the alternative hypothesis\index{alternative hypothesis, $H_a$}\index{hypothesis!alternative, $H_a$}\index{10540@$H_a$, alternative hypothesis} is $H_a:\muX\neq \mu_0$ and the sub-case
where $\overline{X}>\mu_0$, we compute the $p$-value by
$$
\text{\tt 2\,*\,(1-NORM.DIST(}8.2, 8.1, .22/\text{\tt SQRT(30)}, 1\text{\tt))}\index{NORM.DIST@{\tt NORM.DIST}, the cumulative Normal distribution in spreadsheets}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}
$$
on a spreadsheet\index{LibreOffice Calc@{\bf LibreOffice Calc}}\index{Calc [LibreOffice]@{\bf Calc} [{\bf LibreOffice}]}\index{Microsoft Excel@{\bf Microsoft Excel}}\index{MS Excel@{\bf MS Excel}}\index{Excel [Microsoft]@{\bf Excel} [{\bf Microsoft}]}, which yields $p=.01278$\,.
Since $p$ is not less than $.01$, we fail to reject $H_0$\index{We fail to reject the null hypothesis H0@''We fail to reject the null hypothesis $H_0$.''}\index{failure to reject $H_0$} at the $\alpha=.01$ level of
significance\index{significance level}.
The quality control engineer should therefore say to company management
\begin{quote}
\hskip-2.5mm{\it ``Today's sample\index{sample}, though off weight, did not
provide statistically significant\index{statistically significant, for data in a hypothesis test} evidence, at the stringent level of significance\index{significance level}
of $\alpha=.01$ that we have chosen to use in these tests, that the jar-filling
machine is in need of recalibration today ($p=.01278$).''}
\end{quote}
\end{example}
\subsection{Cautions}\label{ssec:HTcautions}
As we have seen before, the requirement that the sample we are using in our
hypothesis test\index{hypothesis test} is a valid SRS\index{simple random sample, SRS}\index{SRS, simple random sample} is quite important. But it is also
quite hard to get such a good sample, so this is often something that can be
a real problem in practice, and something which we must assume is true with
often very little real reason.
It should be apparent from the above Facts and Examples that most of the work
in doing a hypothesis test\index{hypothesis test}, after careful initial
set-up, comes in computing the $p$-value.
Be careful of the phrase {\it statistically significant}\index{statistically significant, for data in a hypothesis test}. It does not mean that the effect is
large! There can be a very small effect, the $\overline{X}$\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}
might be very close to $\mu_0$ and yet we might reject the null
hypothesis\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$} if the population
standard deviation\index{11400@$\sigmaX$, population standard deviation}\index{standard deviation}\index{population standard deviation, $\sigmaX$} $\sigma_X$
were sufficiently small, or even if the sample size\index{sample size, $n$}\index{10770@$n$, sample size} $n$ were large enough that $\sigma_X/\sqrt{n}$ became
very small. Thus, oddly enough, a statistically significant result, one where
the conclusion of the hypothesis test\index{hypothesis test} was statistically
quite certain, might not be {\it significant} in the sense of mattering very
much. With enough precision, we can be very sure of small effects.
Note that the meaning of the $p$-value\index{p-value@$p$-value of a hypothesis test} is explained above in its definition as a conditional
probability\index{conditional probability, $P(A\mid B)$}. So $p$ \textit{\textbf{does not}} compute the probability that the null hypothesis\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}
$H_0$ is true, or any such simple thing. In contrast, the
Bayesian\index{Bayesian} approach to probability, which we chose not to use
in this book, in favor of the frequentist approach\index{frequentist approach to probability}, does have a kind of hypothesis test\index{hypothesis test}
which includes something like the direct probability that $H_0$ is true. But
we did not follow the Bayesian approach here because in many other ways it is
more confusing.
In particular, one consequence of the real meaning of the
$p$-value\index{p-value@$p$-value of a hypothesis test} as we use it in this
book is that sometimes we will reject a true null hypothesis $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$} just out of bad luck. In fact, if $p$ is just
slightly less than $.05$, we would reject $H_0$ at the $\alpha=.05$ significance
level\index{significance level} although, in slightly less than one case in
20 (meaning 1 SRS\index{simple random sample, SRS}\index{SRS, simple random sample} out of 20 chosen independently\index{independent events}), we would do this
rejection even though $H_0$ was true.
We have a name for this situation.
\begin{definition}\label{def:typeIerror}
When we reject a true null hypothesis $H_0$\index{null hypothesis, $H_0$}\index{hypothesis!null, $H_0$}\index{10550@$H_0$, null hypothesis}\index{We reject the null hypothesis H0@''We reject the null hypothesis $H_0$.''}\index{rejection of $H_0$} this is called a {\bf type I error}\index{type I error}. Such an error
is usually (but not always: it depends upon how the
population\index{population of a statistical study}, variable\index{variable},
parameter\index{population parameter}\index{parameter, population}, and hypotheses\index{hypothesis} were
set up) a {\bf false positive}\index{false positive}, meaning that something
exciting and new (or scary and dangerous) was found even though it is not
really present in the population.
\end{definition}
\begin{example}\label{eg:TI1}
Let us look back at the cosmetic company with a jar-filling machine from
Example~\ref{eg:HT2}. We don't know what the median\index{median} of the
SRS\index{simple random sample, SRS}\index{SRS, simple random sample} data
was, but it wouldn't be surprising if the data were
symmetric\index{symmetric histogram, dataset, or distribution} and therefore
the median would be the same as the sample mean\index{sample mean, $\overline{x}$}\index{mean!sample}\index{11600@$\overline{x}$, sample mean}
$\overline{X}=8.2$\,. That means that there were at least 15 jars with at least
8.2 ounces of cream in them, even though the jars are all labelled ``8oz.''
The company is giving away at least $.2\times15=3$ ounces of the very valuable
cream -- in fact, probably much more, since that was simply the overfilling
in that one sample\index{sample}.
So our intrepid quality assurance engineer might well propose to management
to increase the significance level\index{significance level} $\alpha$ of the
testing regime in the factory. It is true that with a larger $\alpha$, it will
be easier for simple randomness\index{randomness} to result in type I
errors\index{type I error}, but unless the recalibration process takes a very
long time (and so results in fewer jars being filled that day), the
cost-benefit analysis probably leans towards fixing the machine slightly too
often, rather than waiting until the evidence is extremely strong that it must be
done.
\end{example}
\ \vfill
\pagebreak
\section*{Exercises}
\begin{exercise}
You buy seeds of one particular species to plant in your garden, and the
information on the seed packet tells you that, based on years of experience
with that species, the mean number of days to germination is 22, with
standard deviation 2.3 days.
What is the population and variable in that information? What parameter(s)
and/or statistic(s) are they asserting have particular values? Do you think
they can really know the actual parameter(s) and/or statistic's(s') value(s)?
Explain.
You plant those seeds on a particular day. What is the probability that the
first plant closest to your house will germinate within half a day of the
reported mean number of days to germination -- that is, it will germinate
between 21.5 and 22.5 days after planting?
You are interested in the whole garden, where you planted 160 seeds, as well.
What is the probability that the average days to germination of all the plants
in your garden is between 21.5 and 22.5 days? How do you know you can use
the Central Limit Theorem to answer this problem -- what must you assume about
those 160 seeds from the seed packet in order for the CLT to apply?
\end{exercise}
\begin{exercise}
You decide to expand your garden and buy a packet of different seeds. But
the printing on the seed packet is smudged so you can see that the standard
deviation for the germination time of that species of plant is 3.4 days,
but you cannot see what the mean germination time is.
So you plant 100 of these new seeds and note how long each of them takes to
germinate: the average for those 100 plants is 17 days.
What is a 90\% confidence interval for the population mean of the germination
times of plants of this species? Show and explain all of your work. What
assumption must you make about those 100 seeds from the packet in order for
your work to be valid?
What does it mean that the interval you gave had {\it 90\% confidence}\/?
Answer by talking about what would happen if you bought many packets of those
kinds of seeds and planted 100 seeds in each of a bunch of gardens around
your community.
\end{exercise}
\begin{exercise}
An SRS\index{simple random sample, SRS}\index{SRS, simple random sample} of size 120 is taken from the student population at the very large
Euphoria State University [ESU], and their GPAs are computed. The sample
mean GPA is 2.71\,. Somehow, we also know that the population standard
deviation of GPAs at ESU is .51\,. Give a confidence interval at the 90\%
confidence level for the mean GPA of all students at ESU.
You show the confidence interval you just computed to a fellow student who is
not taking statistics. They ask, ``Does that mean that 90\% of students at
ESU have a GPA which is between $a$ and $b$?'' where $a$ and $b$ are the lower
and upper ends of the interval you computed. Answer this question, explaining
why if the answer is {\it yes} and both why not and what is a better way of
explaining this 90\% confidence interval, if the answer is {\it no}.
\end{exercise}
\begin{exercise}
The recommended daily calorie intake for teenage girls is 2200 calories per
day. A nutritionist at Euphoria State University believes the average daily
caloric intake of girls in her state to be lower because of the advertising
which uses underweight models targeted at teenagers. Our nutritionist finds
that the average of daily calorie intake for a random sample of size $n=36$
of teenage girls is 2150.
Carefully set up and perform the hypothesis test for this situation and these
data. You may need to know that our nutritionist has been doing studies for
years and has found that the standard deviation of calorie intake per day in
teenage girls is about 200 calories.
Do you have confidence in the nutritionist's conclusions? What does she need
to be careful of, or to assume, in order to get the best possible results?
\end{exercise}
\begin{exercise}
The medication most commonly used today for post-operative pain relief after
minor surgery takes an average of 3.5 minutes to ease patients' pain, with
a standard deviation of 2.1 minutes. A new drug is being tested which will
hopefully bring relief to patients more quickly. For the test, 50 patients
were randomly chosen in one hospital after minor surgeries. They were given
the new medication and how long until their pain was relieved was timed: the
average in this group was 3.1 minutes. Do these data provide statistically
significant evidence, at the 5\% significance level, that the new drug acts
more quickly than the old?
Clearly show and explain all your set-up and work, of course!
\end{exercise}
\begin{exercise}
The average household size in a certain region several years ago was 3.14
persons, while the standard deviation was .82 persons. A sociologist wishes
to test, at the 5\% level of significance, whether the mean household size is
different now. Perform the test using new information collected by the
sociologist: in a random sample of 75 households this past year, the average
size was 2.98 persons.
\end{exercise}
\begin{exercise}
A medical laboratory claims that the mean turn-around time for performance of
a battery of tests on blood samples is 1.88 business days. The manager of a
large medical practice believes that the actual mean is larger. A random
sample of 45 blood samples had a mean of 2.09 days. Somehow, we know that
the population standard deviation of turn-around times is 0.13 day. Carefully
set up and perform the relevant test at the 10\% level of significance.
Explain everything, of course.
\end{exercise}
\backmatter
\bibliographystyle{amsalpha}
\bibliography{refs}
\printindex
\end{document}
%
% TO DO:
% 1) A section on timeplots and what they possibly are, early on (before
% scatterplots have been introduced).
% 2) Clearly a lot more has to be said about individuals, populations, and
% variables, given the students' persisting confusion.
% 3) Write some exercises which make the students create their own summary
% sheets, e.g., for ``what are the kinds of statistical graphs we know
% about so far, how do they work, in which situations do they apply, and
% how can you recognize them?'' and maybe ``what are the measures of
% central tendency we have so far and what are their principal
% differences?''
% 4) Put Bourbaki windy-road signs in some places?
%