program calhnb(fin, fout, output);
(* calhnb: calculate e(hnb), var(hnb), ae(hnb), avar(hnb)
Dr. Thomas D. Schneider
toms@alum.mit.edu
permanent email: toms@alum.mit.edu (use only if first address fails)
https://alum.mit.edu/www/toms/
module libraries required: delman, prgmods
*)
label 1; (* end of program *)
const
(* begin module version *)
version = 2.29; (* of calhnb.p 2005 Jul 16
2005 Jul 16, 29: calhnb.50.fin and calhnb.50.fout links
2.28, 2003 Jul 23: remove 'entropy'
2.27, 2001 Jun 7: upgrade documentation
2.26, 2001 Jun 7: upgrade documentation
2.25, 2001 Jun 7: add sd(n) column
2.24, 2000 Jan 18: allow approx. for large n.
2.23, upgrade documentation
2.22, 1994 Sep 5: last changes
original before 1984 jan 18 *)
(* end module version *)
{
2001 Jun 7: the original one liner is not too descriptive!
calhnb: calculate e(hnb), var(hnb), ae(hnb), avar(hnb), e(n), sd(n)
}
(* begin module describe.calhnb *)
(*
name
calhnb: small-sample correction for information and uncertainty
synopsis
calhnb(fin: in, fout: out, output: out)
files
fin: the genomic composition (integers) on one line followed by
a set of integers, one per line representing values of n
fout: a table showing n, e(hnb), ae(hnb) and their difference.
the variances var(hnb) and avar(hnb) are tabulated along with
the difference between their square roots. This is the difference
between the standard deviations. e(n) is found from the genomic
uncertainty minus e(hnb). Finally, sd(n) = sqrt(var(hnb)) is given.
output: messages to the user.
describe
Given a genomic composition and a series of integers (n) that represent
the number of sample sites, calhnb calculates the sampling error as e(hnb)
and the variance var(hnb). It also finds the approximations ae(hnb) and
avar(hnb). These values are presented in a table along with the
differences between the exact and approximate calculations. This table
will allow a user to decide when to use the approximations. Beware that
the exact calculation becomes very expensive for large n. For this
reason, I use the approximate computation for n > 20 in rseq and alpro.
examples
When used as fin, the calhnb.fin file should generate the calhnb.fout file
in the fout. The data should be identical those given in Figure A.2 on
page 428 of the Appendix of Schneider et al 1986.
documentation
"Information content of binding sites on nucleotide sequences"
T. D. Schneider, G. D. Stormo, L. Gold, and A. Ehrenfeucht
JMB 188:415-431 (1986) [see link below]
see also
{Example input file, fin:} calhnb.fin
{Corresponding output file, fout:} calhnb.fout
{fin file for values up to n = 50:} calhnb.50.fin
{fout file for values up to n = 50:} calhnb.50.fout
{Discussion about correctiing for small sample size:}
https://alum.mit.edu/www/toms/small.sample.correction.html
{Schneider et al. (1986):}
https://alum.mit.edu/www/toms/paper/schneider1986
{related programs:} rseq.p, alpro.p
author
Thomas D. Schneider
bugs
It would be nice to have a generalized algorithm for any number
of symbols.
*)
(* end module describe.calhnb *)
var
fin, (* input file of n values one per line *)
fout: (* table of e(hnb), ae(hnb) and variances *)
text;
(* begin module halt *)
procedure halt;
(* stop the program. the procedure performs a goto to the end of the
program. you must have a label:
label 1;
declared, and also the end of the program must have this label:
1: end.
examples are in the module libraries.
this is the only goto in the delila system. *)
begin
writeln(output,' program halt.');
goto 1
end;
(* end module halt version = 'prgmod 3.96 85 mar 18 tds'; *)
(* begin module calehnb *)
procedure calehnb(n: integer;
gna,gnc,gng,gnt: integer;
var hg, ehnb, varhnb: real
(* ; debugging: boolean *));
(* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence
sites. gna to gnt are the composition to use for the genome probabilities
of a to t. the genomic uncertainty hg and the variance var(hnb)
(=varhnb) are also calculated.
if the variable debugging is passed to the procedure then the individual
combinations of hnb are displayed.
note: this procedure should not be broken into smaller
procedures so that it remains efficient.
version = 3.02; of procedure calehnb 1983 nov 23 *)
const
maxsize = 200; (* the largest n allowed *)
accuracy = 10000; (* less than (1/accuracy) bits error is demanded
for the sum of pnb (see variable 'total') at the end of the procedure *)
var
log2: real; (* natural log of 2, used to find log base 2 *)
logn: real; (* log of n *)
nlog2: real; (* n * log2 *)
gn: integer; (* sum of gna..gnt *)
logpa,logpc,logpg,logpt: real; (* logs of genome probabilities *)
(* log of n factorial is the sum of i=1 to n of log(i).
the array below represents these logs up to n *)
logfact: array[0..maxsize] of real;
(* precalculated values of -p*log2(p), where p=nb/n for
nb = 0 .. n. m stands for minus *)
mplog2p: array[0..maxsize] of real;
i: integer; (* index for logfact and mplog2p *)
logi: real; (* natural log of i *)
na, nc, ng, nt: integer; (* numbers of bases in a site *)
done: boolean; (* true when the loop is completed *)
pnb: real; (* multinomial probability of a combination
of na, nc, ng, nt *)
hnb: real; (* uncertainty for a combination of na..nt *)
pnbhnb: real; (* pnb*hnb, an intermediate result *)
sshnb: real; (* sum of squares of hnb *)
(* variables for testing program correctness: *)
total: real; (* sum of pnb over all combinations of na..nt
if this is not 1.00, the program is in error *)
counter: integer; (* counts the number of times through
the loop *)
begin (* calehnb *)
(* prevent access to outside the arrays: *)
if n > maxsize then begin
writeln(output,' procedure calehnb: n > maxsize (',
n:1,'>',maxsize:1,')');
halt
end;
counter := 0;
total := 0.0;
log2 := ln(2);
logn := ln(n);
nlog2 := n * log2;
(* get logs of genome probabilities *)
gn := gna + gnc + gng + gnt;
logpa := ln(gna/gn);
logpc := ln(gnc/gn);
logpg := ln(gng/gn);
logpt := ln(gnt/gn);
(* find genomic uncertainty *)
hg := -(gna*logpa + gnc*logpc + gng*logpg + gnt*logpt)/(gn*log2);
ehnb := 0.0; (* start error uncertainty at zero *)
sshnb := 0.0;
(* make table of log of n factorial up to n
and entropies for nb/n *)
logfact[0] := 0.0; (* factorial(0) = 0 *)
mplog2p[0] := 0.0;
for i := 1 to n do begin
logi := ln(i);
logfact[i] := logfact[i - 1] + logi;
mplog2p[i] := - i * (logi - logn) / nlog2
end;
(* begin by looking at the combination with all a: na = n *)
na := n;
nc := 0;
ng := 0;
nt := 0;
(* the following loop simulates a number of nested loops
of the form:
for b1=a to t do
for b2=b1 to t do
for b3=b2 to t do
...
for bn=b(n-1) to t do ...
the resulting set of variables increase in alphabetic order
since no inner loop variable can have a value less than any
outer loop. the number of times through the inner-most loop
is given by:
o = (n + 1)*(n + 2)*(n + 3)/6
in the case where there are four symbols (a,c,g,t) and n is
the number of nested loops.
a recursive set of loops would be possible, but it
would use up too much memory in practical cases (up to n=150
or higher). a second algorithm sequests the loop variables
into an array and increments them there. however, the goal
is to get all possible combinations for na, nc, ng, nt, where
the sum of these is n. the nested loops provide all the
combinations in alphabetic order, assuring that there can not
be any duplicates. to find nb (one of na..nt) one would look
at which of the variables b1 to bn were of value b. this is
a wasteful operation.
the loop below simulates the array of control variables
by changing each nb directly.
*)
done := false;
repeat
(* pnb is calculated by taking the log of the expression
fact(n) na nc ng nt
pnb = ------------------- pa * pc * pg * pt .
fact(na).. fact(nt)
log(pnb) generates a series of sums, allowing
the calculation to proceed by addition and
multiplication rather than multiplication and
exponentiation. the factorials become tractable
in this way *)
pnb := exp(logfact[n] (* n factorial *)
- logfact[na]
- logfact[nc]
- logfact[ng]
- logfact[nt]
+ na * logpa
+ nc * logpc
+ ng * logpg
+ nt * logpt);
hnb := mplog2p[na]
+ mplog2p[nc]
+ mplog2p[ng]
+ mplog2p[nt];
pnbhnb := pnb * hnb;
ehnb := ehnb + pnbhnb;
sshnb := sshnb + pnbhnb * hnb; (* sum of squares of hnb *)
(* the following section keeps track of the calculation
and writes out the current set of nb. *)
counter := succ(counter);
(* if debugging then begin
write(output,' ',counter:2,' ');
for i := 1 to na do write(output,'a');
for i := 1 to nc do write(output,'c');
for i := 1 to ng do write(output,'g');
for i := 1 to nt do write(output,'t');
write(output,' ',na:3,nc:3,ng:3,nt:3);
writeln(output,' pnb = ',pnb:10:5);
end; *)
total := total + pnb;
(* the remaining portion of this repeat loop generates
the values of na, nc, ng and nt. notice that
there are 7 possibilities at each loop increment.
other than the stop, in each case the sum of
na+nc+ng+nt remains constant (=n). *)
if nt > 0
then begin (* ending on a t - do outer loops *)
if ng > 0
then begin (* turn g into t *)
ng := ng - 1;
nt := nt + 1
end
else if nc > 0
then begin (* turn one c into g,
and all t to g (note ng = 0 initially) *)
nc := nc - 1;
ng := nt + 1;
nt := 0
end
else if na > 0
then begin (* turn one a into c and
all g and t to c. (note ng=nc=0 initially) *)
na := na - 1;
nc := nt + 1;
nt := 0
end
else done := true (* since nt = n *)
end
else begin (* no t - increment innermost loop *)
if ng > 0
then begin (* turn g into t *)
ng := ng - 1;
nt := nt + 1
end
else if nc > 0
then begin (* turn c into g *)
nc := nc - 1;
ng := ng + 1
end
else begin (* na > 0; turn a into c *)
na := na - 1;
nc := nc + 1
end
end
until done;
(* final adjustment: we only have the sum of squares so far *)
varhnb := sshnb - ehnb*ehnb;
(* if this message appears, there is either a bug in the code or
the computer cannot be as accurate as requested *)
if accuracy <> round(accuracy * total) then begin
writeln(output,' procedure calehnb: the sum of probabilities is');
writeln(output,' not accurate to one part in ',accuracy:1);
writeln(output,' the sum of the probabilities is ',total:10:8);
end;
(* if this message appear, then there is an error in the
repeat-until loop: it did not repeat as many times as
is expected from the algorithm *)
if counter <> round((n+1)*(n+2)*(n+3)/6) then begin
writeln(output,' procedure calehnb: program error, the number of');
writeln(output,' calculations is in error');
halt
end;
(* writeln(output, ' total: ',total:10:5);
writeln(output,' count = ',counter:1);
writeln(output,' (n+1)*(n+2)*(n+3)/6 = ',
round((n+1)*(n+2)*(n+3)/6):1); *)
end; (* calehnb *)
(* end module calehnb *)
(* begin module calaehnb *)
procedure calaehnb(n: integer;
gna,gnc,gng,gnt: integer;
var hg, aehnb, avarhnb: real);
(* calculate ae(hnb) in bits/bp (=aehnb) for a number (n) of example sequence
sites. gna to gnt are the composition to use for the genome probabilities
of a to t. the genomic uncertainty (=hg) and the variance avar(hnb) (=avarhnb) are
also calculated.
this procedure gives approximations for e(hnb) and var(hnb).
it is based on a formula derived by jeff haemer.
see also:
G. P. Basharin
Theory Probability Appl. 4(3): 333-336 (1959)
'On a statistical estimate for the entropy of a sequence
of independent random variables'
version = 3.01; of procedure calaehnb 1983 nov 23 *)
var
log2: real; (* natural log of 2 *)
gn: integer; (* sum of gna..gnt *)
pa, pc, pg, pt: real; (* genomic probabilities *)
e: real; (* the approximate sampling error *)
begin (* calaehnb *)
log2 := ln(2);
gn := gna + gnc + gng + gnt;
pa := gna/gn;
pc := gnc/gn;
pg := gng/gn;
pt := gnt/gn;
hg := -( (pa*ln(pa))
+(pc*ln(pc))
+(pg*ln(pg))
+(pt*ln(pt)))/log2;
e := 3/(2*log2*n);
aehnb := hg - e;
avarhnb := e * e
end; (* calaehnb *)
(* end module calaehnb *)
procedure themain(var fin, fout: text);
(* the main procedure of the program *)
const
inf = 9; (* field for information display *)
ind = 5; (* decimal place for information display *)
{
extra = 5; (* more size for final output *)
}
mintell = 25; (* the minimum n for which to tell the user
were we are... *)
maxsize = 200; (* maximum size for computing exactly. This
*must* match the same constant in procedure calehnb. *)
var
gna,gnc,gng,gnt: integer; (* genomic composition *)
n: integer; (* number of example sequences *)
h, ha, (* genomic hg for calehnb or calaehnb procedures *)
ehnb, (* e(hnb) *)
aehnb, (* ae(hnb) *)
varhnb, (* variance of hnb *)
avarhnb: (* approximate variance of hnb *)
real;
flag: boolean; (* for flagging cases that only have approx computation *)
flags: boolean; (* true if flag was ever true *)
power: integer; (* 10 raised to the number of digits displayed (ind).
by multiplying by this number and rounding, one can check the
accuracy of the two genomic uncertainty calculations h and ha.
if this is not done, one can have round-off problems. *)
begin (* themain *)
writeln(output,' calhnb ',version:4:2);
rewrite(fout);
writeln(fout,'* calhnb ',version:4:2,' calculate statistics of hnb');
writeln(fout,'*');
(* obtain the genomic composition *)
reset(fin);
if eof(fin) then begin
writeln(output,' file fin is empty');
halt
end;
readln(fin,gna,gnc,gng,gnt);
writeln(fout,'* genomic composition: ',
' a = ',gna:1,', ',
' c = ',gnc:1,', ',
' g = ',gng:1,', ',
' t = ',gnt:1);
(* find out the genomic uncertainty: *)
calaehnb(1,gna,gnc,gng,gnt,ha,aehnb,avarhnb);
writeln(fout,'* genomic uncertainty, hg = ',ha:inf:ind,' bits');
writeln(output,'* genomic uncertainty, hg = ',ha:inf:ind,' bits');
writeln(fout,'*');
writeln(fout,'* n is the number of sequence examples');
writeln(fout,'* e(hnb) is the expectation of the uncertainty hnb',
' calculated from n examples');
writeln(fout,'* ae(hnb) an approximation of e(hnb) that is',
' calculated');
writeln(fout,'* more rapidly than e(hnb) for large n');
writeln(fout,'* e diff e(hnb)-ae(hnb)');
writeln(fout,'* var(hnb) is the variance of hnb');
writeln(fout,'* avar(hnb) is the approximate variance of hnb');
writeln(fout,'* std diff is the difference between the standard',
' deviations');
writeln(fout,'* (square roots of) var(hnb) and avar(hnb)');
writeln(fout,'* e(n) hg - e(hnb), the sampling error.');
writeln(fout,'* sd(n) square root of var(hnb).');
writeln(fout,'*');
writeln(fout,'* units are bits/base, except for the variances which');
writeln(fout,'* are the square of these.');
writeln(fout,'*');
writeln(fout,'*','n':3,
' ','e(hnb)':inf,
' ','ae(hnb)':inf,
' ','e diff':inf,
' ','var(hnb)':inf,
' ','avar(hnb)':inf,
' ','std diff':inf,
' ','e(n)':inf,
' ','sd(n)':inf);
writeln(fout,'*');
(* ten to the number of digits. see definition of power *)
power := round(exp(ind*ln(10.0)));
flags := false;
while not eof(fin) do begin
readln(fin, n);
if n > mintell then writeln(output,' calculating n = ',n:4);
if n <= maxsize
then begin
calehnb(n, gna,gnc,gng,gnt,h,ehnb,varhnb);
flag := false;
end
else begin
flag := true;
flags := true;
end;
calaehnb(n, gna,gnc,gng,gnt,ha,aehnb,avarhnb);
write(fout,' ',n:3,
' ',ehnb:inf:ind,
' ',aehnb:inf:ind,
' ',(ehnb-aehnb):inf:ind);
write(fout,' ',varhnb:inf:ind,
' ',avarhnb:inf:ind,
' ',(sqrt(varhnb) - sqrt(avarhnb)):inf:ind,
' ',(h - ehnb):inf:ind);
write(fout,' ',sqrt(varhnb):inf:ind);
if flag then begin
write(fout, ' *');
end
else if round(power * h) <> round(power * ha) then begin
writeln(output,' program error in procedure themain ',
'h <> ha (',h:inf:ind,'<>',ha:inf:ind,')');
halt
end;
writeln(fout)
end;
if flags then begin
writeln(fout,'*');
writeln(fout, '* for these, only approximate values were computed');
writeln(fout, 'last values in scientific notation:');
{
write(fout,' ',n:3,
' ',ehnb:inf+extra:ind+extra,
' ',aehnb:inf+extra:ind+extra,
' ',(ehnb-aehnb):inf+extra:ind+extra);
write(fout,' ',varhnb:inf+extra:ind+extra,
' ',avarhnb:inf+extra:ind+extra,
' ',(sqrt(varhnb) - sqrt(avarhnb)):inf+extra:ind+extra,
' ',(h - ehnb):inf+extra:ind+extra);
}
write(fout,' ',n:3,
' ',ehnb:inf,
' ',aehnb:inf,
' ',(ehnb-aehnb):inf);
write(fout,' ',varhnb:inf,
' ',avarhnb:inf,
' ',(sqrt(varhnb) - sqrt(avarhnb)):inf,
' ',(h - ehnb):inf);
writeln(fout,'*');
end;
end; (* themain *)
begin (* calhnb *)
themain(fin, fout);
1: end. (* calhnb *)