Packt+ | Advance your knowledge in tech

You're reading from Go Machine Learning Projects Eight projects demonstrating end-to-end machine learning and predictive analytics applications in Go

Product type Paperback

Published in Nov 2018

Publisher Packt

ISBN-13 9781788993401

Length 348 pages

Edition 1st Edition

Languages

Tools

OpenCV

Concepts

Machine Learning

Author (1):

Xuanyi Chew

View More author details

Once we've done that, we can move the previous main() into a different function, leaving ourselves with a blank canvas for main() again. We're now ready for the meat of the program. What follows is only a skeleton; you're encouraged to actively modify it as you work through it:

// main loads the tweets stored in dev.json, runs them through the processor,
// writes a KNN distance plot (KNNDist.png, used to look for DBSCAN elbows),
// clusters the tweets with dmm, kmeans and dbscan, and finally logs the
// members of every cluster produced by each of the three algorithms.
func main() {
	f, err := os.Open("dev.json")
	dieIfErr(err)
	// The file is only read from, so the error returned by Close is not
	// actionable here.
	defer f.Close()

	tweets := load(f)
	p := newProcessor()
	tweets = p.process(tweets)

	// expC is shared by knn, dmm, kmeans and the DMM grouping below.
	expC := 20
	distances, last := knn(asMatrix(tweets), expC, clusters.EuclideanDistance)
	log.Printf("distances %v | %v", distances, last)

	// plot for DBSCAN elbows
	plt, err := plot.New()
	dieIfErr(err)
	// The plot is a diagnostic aid only: a failure is reported instead of
	// being silently dropped, but it does not stop the clustering itself.
	if err := plotutil.AddLinePoints(plt, "KNN Distance", plotKNNDist(last)); err != nil {
		log.Printf("adding KNN distance line: %v", err)
	}
	if err := plt.Save(25*vg.Centimeter, 25*vg.Centimeter, "KNNDist.png"); err != nil {
		log.Printf("saving KNNDist.png: %v", err)
	}

	// actually do the clustering
	dmmClust := dmm(tweets, expC, p.corpus.Size())
	kmeansClust := kmeans(tweets, expC)
	dbscanClust, clustCount := dbscan(tweets)

	// print output
	log.Printf("len(tweets)%d", len(tweets))
	var buf bytes.Buffer

	// writeGroups appends one "CLUSTER" header per group to buf, followed by
	// the clean2 field of every tweet whose index is listed in that group.
	writeGroups := func(groups [][]int) {
		for i, members := range groups {
			fmt.Fprintf(&buf, "CLUSTER %d: %d\n", i, len(members))
			for _, idx := range members {
				fmt.Fprintf(&buf, "\t%v\n", tweets[idx].clean2)
			}
		}
	}

	bc := byClusters2(dmmClust, expC)
	lc, tweetCount := largestCluster2(dmmClust)
	fmt.Fprintf(&buf, "Largest Cluster %d - %d tweets\n", lc, tweetCount)
	writeGroups(bc)
	fmt.Fprintf(&buf, "==============\n")

	writeGroups(byClusters(kmeansClust, expC))
	fmt.Fprintf(&buf, "==============\n")

	writeGroups(byClusters(dbscanClust, clustCount))

	log.Println(buf.String())
}

There are some utility functions that I have yet to show you. Now it's time to define them:

 // dmm converts the tweets with toDocs and hands them to
 // dmmclust.FindClusters, returning one dmmclust.Cluster per tweet.
 //
 // expC is used as the configuration's K and corpusSize as its Vocabulary.
 // The Gibbs sampler is seeded with the constant 1337. Any error reported by
 // FindClusters is passed to dieIfErr.
 func dmm(a []*processedTweet, expC int, corpusSize int) []dmmclust.Cluster {
 	var cfg dmmclust.Config
 	cfg.K = expC
 	cfg.Vocabulary = corpusSize
 	cfg.Iter = 1000
 	cfg.Alpha = 0.0
 	cfg.Beta = 0.01
 	cfg.Score = dmmclust.Algorithm4
 	cfg.Sampler = dmmclust.NewGibbs(rand.New(rand.NewSource(1337)))

 	assignments, err := dmmclust.FindClusters(toDocs(a), cfg)
 	dieIfErr(err)
 	return assignments
 }
// kmeans builds a clusters.KMeans clusterer (first argument 100000, expC
// clusters, Euclidean distance), trains it on the matrix form of the tweets
// and returns the clusterer's guesses. Errors from construction or training
// are passed to dieIfErr.
func kmeans(a []*processedTweet, expC int) []int {
	clusterer, err := clusters.KMeans(100000, expC, clusters.EuclideanDistance)
	dieIfErr(err)

	matrix := asMatrix(a)
	dieIfErr(clusterer.Learn(matrix))

	guesses := clusterer.Guesses()
	return guesses
}
// dbscan builds a clusters.DBSCAN clusterer with the arguments
// (5, 0.965, 8, Euclidean distance), trains it on the matrix form of the
// tweets and returns its guesses together with the number of distinct
// values that appear among those guesses. Errors from construction or
// training are passed to dieIfErr.
func dbscan(a []*processedTweet) ([]int, int) {
	clusterer, err := clusters.DBSCAN(5, 0.965, 8, clusters.EuclideanDistance)
	dieIfErr(err)

	matrix := asMatrix(a)
	dieIfErr(clusterer.Learn(matrix))
	labels := clusterer.Guesses()

	// Count each label value the first time it is encountered.
	seen := make(map[int]struct{})
	distinct := 0
	for i := range labels {
		if _, dup := seen[labels[i]]; dup {
			continue
		}
		seen[labels[i]] = struct{}{}
		distinct++
	}
	return labels, distinct
}
// largestCluster returns the label that occurs most often in clusters and
// the number of times it occurs.
//
// When several labels share the highest count, the smallest label is
// returned. Without this rule the winner would depend on Go's randomized map
// iteration order and could differ from run to run. For an empty input the
// result is (0, 0).
func largestCluster(clusters []int) (int, int) {
	counts := make(map[int]int)
	for _, c := range clusters {
		counts[c]++
	}

	var best, bestCount int
	for label, n := range counts {
		// Every stored count is at least 1, so the first entry visited always
		// replaces the zero-valued starting point.
		if n > bestCount || (n == bestCount && label < best) {
			best, bestCount = label, n
		}
	}
	return best, bestCount
}
// largestCluster2 returns the cluster ID (as reported by each element's ID
// method) that occurs most often in clusters and the number of times it
// occurs.
//
// When several IDs share the highest count, the smallest ID is returned.
// Without this rule the winner would depend on Go's randomized map iteration
// order and could differ from run to run. For an empty input the result is
// (0, 0).
func largestCluster2(clusters []dmmclust.Cluster) (int, int) {
	counts := make(map[int]int)
	for _, c := range clusters {
		counts[c.ID()]++
	}

	var best, bestCount int
	for id, n := range counts {
		// Every stored count is at least 1, so the first entry visited always
		// replaces the zero-valued starting point.
		if n > bestCount || (n == bestCount && id < best) {
			best, bestCount = id, n
		}
	}
	return best, bestCount
}
// byClusters groups the indices of a by the label stored at each index.
//
// The result has expectedClusters entries; label v is collected in entry
// v-1. Entries labelled -1 are left out. When expectedClusters is 0 the
// result is nil. If a label cannot be placed (the slice access panics), the
// expected cluster count and the offending label are logged and the panic is
// raised again.
func byClusters(a []int, expectedClusters int) (retVal [][]int) {
	if expectedClusters == 0 {
		return nil
	}
	retVal = make([][]int, expectedClusters)

	// label lives outside the loop so the deferred handler can report the
	// value that was being processed when the panic happened.
	var label int
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		log.Printf("exp %v | %v", expectedClusters, label)
		panic(r)
	}()

	for idx := range a {
		label = a[idx]
		if label == -1 {
			continue
		}
		slot := label - 1
		retVal[slot] = append(retVal[slot], idx)
	}
	return retVal
}
// byClusters2 groups the indices of a by the value each element's ID method
// returns. The result has expectedClusters entries, and the index of every
// element is appended to the entry selected by its ID.
func byClusters2(a []dmmclust.Cluster, expectedClusters int) (retVal [][]int) {
	retVal = make([][]int, expectedClusters)
	for idx := range a {
		id := a[idx].ID()
		retVal[id] = append(retVal[id], idx)
	}
	return retVal
}

These are some of the utility functions that may be found in utils.go. They mainly help with tweaking the program. Now run the program by typing go run *.go.