Skip to content

Commit

Permalink
cleaned
Browse files Browse the repository at this point in the history
  • Loading branch information
behrica committed Nov 3, 2024
1 parent ee5256b commit 4e7cded
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 47 deletions.
43 changes: 10 additions & 33 deletions src/scicloj/ml/xgboost.clj
Original file line number Diff line number Diff line change
Expand Up @@ -204,76 +204,53 @@ subsample may be set to as low as 0.1 without loss of model accuracy. Note that
;; (def feature-ds feature-ds)
;; (def target-ds target-ds)
;(println :n-features (tc/row-count feature-ds))
(let [ds (if (not (empty? target-ds))
(let [ds (if (seq? target-ds)
(assoc feature-ds :label (:label target-ds))
feature-ds)

_ (def ds ds)
zero-baseddocs-map
(zipmap
(-> ds :document distinct)
(range))


;n-col (inc (apply max (ds :token-idx)))

_ (def zero-baseddocs-map zero-baseddocs-map)
bow-zeroed
(-> ds
(tc/select-columns [:document :token-idx text-feature-column])
(tc/add-or-replace-column
:document-zero-based
#(map zero-baseddocs-map (:document %))))
_ (def bow-zeroed bow-zeroed)
bow-zeroed
(-> ds
(tc/select-columns [:document :token-idx text-feature-column])
(tc/add-or-replace-column
:document-zero-based
#(map zero-baseddocs-map (:document %))))

sparse-features
(-> bow-zeroed
(tc/select-columns [:document-zero-based :token-idx text-feature-column])
(tc/order-by [:document-zero-based :token-idx])
(tc/rows))



_ (def sparse-features sparse-features)

_ (println :n-col n-col)

csr (csr/->csr sparse-features)

;_ (println :max-column-index+1 (inc (apply max (:column-indices csr))))

_ (def csr csr)
_ (def n-col n-col)

labels
(->
ds
(tc/group-by :document)
(tc/aggregate #(-> % :label first))
(tc/column "summary"))

_ (def labels labels)

m
(DMatrix.
(long-array (:row-pointers csr))
(int-array (:column-indices csr))
(float-array (:values csr))
DMatrix$SparseType/CSR
n-col)

_ (def m m)
]
n-col)]
(when ( seq target-ds)
(.setLabel m (float-array labels)))
{:dmatrix m
:dmatrix-order
(-> bow-zeroed
(tc/select-columns [:document :document-zero-based])
(tc/unique-by [:document :document-zero-based])
(tc/rename-columns {:document-zero-based :row-nr})
)

}))
(tc/rename-columns {:document-zero-based :row-nr}))}))


(defn- dataset->labeled-point-iterator
Expand Down
1 change: 0 additions & 1 deletion src/scicloj/ml/xgboost/csr.clj
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
:row-pointers new-row-pointers})))

(defn ->csr [r-c-vs]
(println :->csr :count (count r-c-vs))
;; data gets sorted by r and c
;; not sure if this is a good idea for performance?

Expand Down
8 changes: 2 additions & 6 deletions src/scicloj/ml/xgboost/model.clj
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,12 @@
;; attention: this function might be smile specific
;; it assumes a certain relation in the order of prediction probabilities in `cls-tens`
;; and the categorical map

[cls-tens target-cname target-categorical-maps]
(def cls-tens cls-tens)
(def target-categorical-maps target-categorical-maps)

(let [rename-map (-> (get-in target-categorical-maps
[target-cname :lookup-table])
(set/map-invert))
;n-cols (count rename-map)
]
(set/map-invert))]
(-> (dtt/reshape cls-tens (dtype/shape cls-tens))
(ds-tens/tensor->dataset)
(ds/rename-columns rename-map)
Expand Down
1 change: 0 additions & 1 deletion test/scicloj/ml/text_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@
:tf
n-sparse-columns)

_ (def m-train m-train)

model
(xgboost/train-from-dmatrix
Expand Down
10 changes: 4 additions & 6 deletions test/scicloj/ml/xgboost_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,15 @@
[scicloj.metamorph.ml.verify :as verify]
[scicloj.ml.smile.discrete-nb :as nb]
[scicloj.ml.smile.nlp :as nlp]
[fastmath.core :as fm]
[scicloj.ml.xgboost]
[tablecloth.api :as tc]
[tech.v3.dataset :as ds]
[tech.v3.dataset.categorical :as ds-cat]
[tech.v3.dataset.column-filters :as cf]
[tech.v3.dataset.modelling :as ds-mod]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn]
[scicloj.ml.xgboost.model :as model]
[tech.v3.datatype.functional :as fun])
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn])
(:import [java.util.zip GZIPInputStream]))


Expand Down Expand Up @@ -79,7 +78,6 @@
:sparse-column :bow-sparse
:n-sparse-columns 100})


explanation (ml/explain model)
test-ds (ds/head reviews 100)
prediction (ml/predict test-ds model)
Expand All @@ -91,6 +89,7 @@
(-> test-ds
(ds-cat/reverse-map-categorical-xforms)
:Score))]
(is (fm/approx= 0.672 (second (first (tc/rows explanation)))))
(is (> train-acc 0.97))))


Expand All @@ -99,7 +98,6 @@
ds (-> src-ds
(ds/categorical->number cf/categorical)
(ds-mod/set-inference-target "species"))
feature-ds (cf/feature ds)
split-data (ds-mod/train-test-split ds {:seed 12345})
train-ds (:train-ds split-data)
test-ds (:test-ds split-data)
Expand Down

0 comments on commit 4e7cded

Please sign in to comment.