138
138
from bisect import bisect_left , bisect_right
139
139
from math import hypot , sqrt , fabs , exp , erf , tau , log , fsum
140
140
from operator import mul
141
- from collections import Counter , namedtuple
141
+ from collections import Counter , namedtuple , defaultdict
142
142
143
143
_SQRT2 = sqrt (2.0 )
144
144
@@ -202,6 +202,43 @@ def _sum(data):
202
202
return (T , total , count )
203
203
204
204
205
+ def _ss (data , c = None ):
206
+ """Return sum of square deviations of sequence data.
207
+
208
+ If ``c`` is None, the mean is calculated in one pass, and the deviations
209
+ from the mean are calculated in a second pass. Otherwise, deviations are
210
+ calculated from ``c`` as given. Use the second case with care, as it can
211
+ lead to garbage results.
212
+ """
213
+ if c is not None :
214
+ T , total , count = _sum ((d := x - c ) * d for x in data )
215
+ return (T , total , count )
216
+ count = 0
217
+ sx_partials = defaultdict (int )
218
+ sxx_partials = defaultdict (int )
219
+ T = int
220
+ for typ , values in groupby (data , type ):
221
+ T = _coerce (T , typ ) # or raise TypeError
222
+ for n , d in map (_exact_ratio , values ):
223
+ count += 1
224
+ sx_partials [d ] += n
225
+ sxx_partials [d ] += n * n
226
+ if not count :
227
+ total = Fraction (0 )
228
+ elif None in sx_partials :
229
+ # The sum will be a NAN or INF. We can ignore all the finite
230
+ # partials, and just look at this special one.
231
+ total = sx_partials [None ]
232
+ assert not _isfinite (total )
233
+ else :
234
+ sx = sum (Fraction (n , d ) for d , n in sx_partials .items ())
235
+ sxx = sum (Fraction (n , d * d ) for d , n in sxx_partials .items ())
236
+ # This formula has poor numeric properties for floats,
237
+ # but with fractions it is exact.
238
+ total = (count * sxx - sx * sx ) / count
239
+ return (T , total , count )
240
+
241
+
205
242
def _isfinite (x ):
206
243
try :
207
244
return x .is_finite () # Likely a Decimal.
@@ -399,13 +436,9 @@ def mean(data):
399
436
400
437
If ``data`` is empty, StatisticsError will be raised.
401
438
"""
402
- if iter (data ) is data :
403
- data = list (data )
404
- n = len (data )
439
+ T , total , n = _sum (data )
405
440
if n < 1 :
406
441
raise StatisticsError ('mean requires at least one data point' )
407
- T , total , count = _sum (data )
408
- assert count == n
409
442
return _convert (total / n , T )
410
443
411
444
@@ -776,41 +809,6 @@ def quantiles(data, *, n=4, method='exclusive'):
776
809
777
810
# See http://mathworld.wolfram.com/Variance.html
778
811
# http://mathworld.wolfram.com/SampleVariance.html
779
- # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
780
- #
781
- # Under no circumstances use the so-called "computational formula for
782
- # variance", as that is only suitable for hand calculations with a small
783
- # amount of low-precision data. It has terrible numeric properties.
784
- #
785
- # See a comparison of three computational methods here:
786
- # http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
787
-
788
- def _ss (data , c = None ):
789
- """Return sum of square deviations of sequence data.
790
-
791
- If ``c`` is None, the mean is calculated in one pass, and the deviations
792
- from the mean are calculated in a second pass. Otherwise, deviations are
793
- calculated from ``c`` as given. Use the second case with care, as it can
794
- lead to garbage results.
795
- """
796
- if c is not None :
797
- T , total , count = _sum ((d := x - c ) * d for x in data )
798
- return (T , total )
799
- T , total , count = _sum (data )
800
- mean_n , mean_d = (total / count ).as_integer_ratio ()
801
- partials = Counter ()
802
- for n , d in map (_exact_ratio , data ):
803
- diff_n = n * mean_d - d * mean_n
804
- diff_d = d * mean_d
805
- partials [diff_d * diff_d ] += diff_n * diff_n
806
- if None in partials :
807
- # The sum will be a NAN or INF. We can ignore all the finite
808
- # partials, and just look at this special one.
809
- total = partials [None ]
810
- assert not _isfinite (total )
811
- else :
812
- total = sum (Fraction (n , d ) for d , n in partials .items ())
813
- return (T , total )
814
812
815
813
816
814
def variance (data , xbar = None ):
@@ -851,12 +849,9 @@ def variance(data, xbar=None):
851
849
Fraction(67, 108)
852
850
853
851
"""
854
- if iter (data ) is data :
855
- data = list (data )
856
- n = len (data )
852
+ T , ss , n = _ss (data , xbar )
857
853
if n < 2 :
858
854
raise StatisticsError ('variance requires at least two data points' )
859
- T , ss = _ss (data , xbar )
860
855
return _convert (ss / (n - 1 ), T )
861
856
862
857
@@ -895,12 +890,9 @@ def pvariance(data, mu=None):
895
890
Fraction(13, 72)
896
891
897
892
"""
898
- if iter (data ) is data :
899
- data = list (data )
900
- n = len (data )
893
+ T , ss , n = _ss (data , mu )
901
894
if n < 1 :
902
895
raise StatisticsError ('pvariance requires at least one data point' )
903
- T , ss = _ss (data , mu )
904
896
return _convert (ss / n , T )
905
897
906
898
@@ -913,12 +905,9 @@ def stdev(data, xbar=None):
913
905
1.0810874155219827
914
906
915
907
"""
916
- if iter (data ) is data :
917
- data = list (data )
918
- n = len (data )
908
+ T , ss , n = _ss (data , xbar )
919
909
if n < 2 :
920
910
raise StatisticsError ('stdev requires at least two data points' )
921
- T , ss = _ss (data , xbar )
922
911
mss = ss / (n - 1 )
923
912
if issubclass (T , Decimal ):
924
913
return _decimal_sqrt_of_frac (mss .numerator , mss .denominator )
@@ -934,12 +923,9 @@ def pstdev(data, mu=None):
934
923
0.986893273527251
935
924
936
925
"""
937
- if iter (data ) is data :
938
- data = list (data )
939
- n = len (data )
926
+ T , ss , n = _ss (data , mu )
940
927
if n < 1 :
941
928
raise StatisticsError ('pstdev requires at least one data point' )
942
- T , ss = _ss (data , mu )
943
929
mss = ss / n
944
930
if issubclass (T , Decimal ):
945
931
return _decimal_sqrt_of_frac (mss .numerator , mss .denominator )
0 commit comments