Don't fit intercept for relative abundance calculation

This prevents unusually low relative abundances from being assigned to certain strains because their k-mers were captured by the intercept. Furthermore, `minmax_scale` doesn't distort the k-mer count distribution the way `scale` did.
broadinstitute · Apr 14, 2021 · 7a82893 · 7a82893
1 parent 3435c15
commit 7a82893
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/src/strainge/search_tool.py b/src/strainge/search_tool.py
@@ -36,7 +36,7 @@
 
 import h5py
 import numpy as np
-from sklearn.preprocessing import scale
+from sklearn.preprocessing import minmax_scale
 from sklearn.linear_model import LinearRegression
 
 from strainge import kmertools, kmerizer
@@ -320,11 +320,11 @@ def calc_relative_abundance(self, sample, result):
             ref_kmerset.intersect(sample.kmers)
 
         kmers, matrix = kmertools.build_kmer_count_matrix([sample, *ref_kmersets])
-        matrix = scale(matrix)
+        matrix = minmax_scale(matrix)
         sample_abun = matrix[:, 0]
         strain_matrix = matrix[:, 1:]
 
-        model = LinearRegression(positive=True)
+        model = LinearRegression(fit_intercept=False, positive=True)
         model.fit(strain_matrix, sample_abun)
         summed_weights = model.coef_.sum()