Learning Julia

Thursday 8 January 2015

Julia (julialang) / Kaggle Tutorial : Train and Test Random Forest

#!/usr/bin/julia

# Real men run their classifiers on data that hasn't been used to train the
# classifier in the first place.

# Make sure we've got the necessary libraries installed
Pkg.add("DataFrames")
Pkg.add("Images")
Pkg.add("ImageView")
Pkg.add("DecisionTree")

# And loaded
require("DataFrames")
require("Images")
require("ImageView")
require("DecisionTree")

# Where the data is (Kaggle provide two zip files that unpack into these directories)
trainlabelsfile="./trainLabels.csv"
trainimagedirectory="./trainResized/"
testlabelsfile="./sampleSubmission.csv"
testimagedirectory="./testResized/"

# All the resized images are 20x20
imageSize=20*20

# Get the data sets and turn them into an index of file numbers and a
# large array of image data where each image is reduced to a greyscale
# 1x400 vector. This is hella slow. I was promised speed!
function readimages(csvfile, imagedirectory)
    # read the csv file
    labels=DataFrames.readtable(csvfile)
    no_of_images=size(labels)[1]
    @printf("Read in %d labels from %s\n", no_of_images, csvfile)
    # read the images, grey them, flatten them, and put them in an array
    x=zeros(no_of_images,imageSize)
    for (a,b) in enumerate(labels[:ID])
        image="$(imagedirectory)$(b).Bmp"
        img=Images.imread(image)
        img_gs=convert(Images.Image{Images.Gray},img)
        img_floats=reinterpret(Float32,float32(img_gs))
        img_vec=reshape(img_floats,1,imageSize)
        @printf("%s %s\n",a,image)
        x[a,:]=img_vec
    end
    return labels,x
end

trainlabels,trainimages=readimages(trainlabelsfile, trainimagedirectory)

# Our classifier can't deal with non-numeric class labels,
# so to use it we convert the ground truth labels like "A" into numbers like 65.
# 1963 did phone, but I forgot to warn them
trainlabelsbodge=int(map((x -> int(x[1])), trainlabels[:Class]))

# Let's just check that we're still sane
char(trainlabelsbodge[20]) # should be a k
ImageView.view(Images.grayim(reshape(trainimages[20,:],20,20))) # should be a picture of a k

# All this preparation having been done, we can now feed the data into
# a random-forest making function:
sherwood=DecisionTree.build_forest(trainlabelsbodge,trainimages,20,50,1.0) # again, bloody ages
## Ensemble of Decision Trees
## Trees:      50
## Avg Leaves: 2208.14
## Avg Depth:  19.3

# Now, how well does this forest do on the data on which it was trained?
@printf("Training Random Forest\n")
shouldbegoodbodge=DecisionTree.apply_forest(sherwood, trainimages)
shouldbegood=map((x->string(char(x))),shouldbegoodbodge)

# On its training data, it only got one wrong
wrong=find(shouldbegood.!=trainlabels[:Class]) # 3055
@printf("Testing a Random Forest on the data used to train it: errors=%s\n", size(wrong)[1])

# We can also try it on the test data
@printf("Reading in the test images\n")
testlabels,testimages=readimages(testlabelsfile, testimagedirectory)
@printf("Classifying the test images\n")
doesitworkbodge=DecisionTree.apply_forest(sherwood, testimages)
doesitwork=map((x->string(char(x))),doesitworkbodge)

# Rather embarrassingly, I can't tell what this is
ImageView.view(Images.grayim(reshape(testimages[1,:],20,20)))
# The classifier thinks it's an H, which is reasonable
doesitwork[1] # "H"

# E, for defs
ImageView.view(Images.grayim(reshape(testimages[2,:],20,20)))
doesitwork[2] # "E"

# Christ on a bike
ImageView.view(Images.grayim(reshape(testimages[3,:],20,20)))
doesitwork[3] # "7"

# This is a P on its side. They're cheating!
ImageView.view(Images.grayim(reshape(testimages[4,:],20,20)))
doesitwork[4] # "O"

# Anyhow, we can replace the dummy labels in our test labels file (I know, I know..)
testlabels[:Class]=doesitwork

# And write it back out for submission to Kaggle
DataFrames.writetable("doom.csv",testlabels)

# Kaggle learn me that I've scored 44%, and am now 26th out of 39 in the competition.
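The gap between one error on the training data and a 44% score on Kaggle's held-out test set is the classic overfitting story. A less painful way to estimate out-of-sample accuracy than submitting is cross-validation. The following is a minimal sketch, not part of the original script: it assumes the trainlabelsbodge and trainimages variables built above, and the old DecisionTree API nfoldCV_forest(labels, features, nsubfeatures, ntrees, nfolds, partialsampling).

# Sketch (assumed API), not in the original post: 4-fold cross-validation with
# the same forest settings as above -- 20 features per split, 50 trees,
# sampling fraction 1.0.
accuracy = DecisionTree.nfoldCV_forest(trainlabelsbodge, trainimages, 20, 50, 4, 1.0)
@printf("Mean cross-validated accuracy: %f\n", mean(accuracy))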
Wednesday 7 January 2015
Julia (julialang) Kaggle Tutorial : Running a Random Forest Classifier
#!/usr/bin/julia

# Right, let's see if we can actually get Julia to do a bit of Machine Learning:

# Make sure we've got the necessary libraries installed
Pkg.add("DataFrames")
Pkg.add("Images")
Pkg.add("ImageView")
Pkg.add("DecisionTree")

# And loaded
require("DataFrames")
require("Images")
require("ImageView")
require("DecisionTree")

# Now let's load our training data
labelsfile="./trainLabels.csv"
imagedirectory="./trainResized/"
imageSize=20*20

# load the ground truth labels
labels=DataFrames.readtable(labelsfile)
no_of_images=size(labels)[1]
@printf("Read in %d labels from %s\n", no_of_images, labelsfile)

# Create a gigantic array of training images
# This is hella slow. I was promised speed!
train=zeros(no_of_images,imageSize)
for (a,b) in enumerate(labels[:ID])
    image="$(imagedirectory)$(b).Bmp"
    img=Images.imread(image)
    assert(size(img)==(20,20)) # paranoia
    img_gs=convert(Images.Image{Images.Gray},img)
    assert(size(img_gs)==(20,20))
    img_floats=reinterpret(Float32,float32(img_gs))
    assert(size(img_floats)==(20,20))
    img_vec=reshape(img_floats,1,imageSize)
    assert(size(img_vec)==(1,400))
    @printf("%s %s\n",a,image)
    train[a,:]=img_vec
end

# We now need to make the ground truth labels
#
# We can make little functions
funct=(x -> int(x[1]))
funct("A") # 65

# And map them over things
nearlywhatwewant=map(funct, labels[:Class])

# Unfortunately this doesn't appear to be good enough.
# This is a DataArray, whatever that means, and we want an Array, whatever that is.
# And this seems to do the conversion, although God knows why:
trainlabels=int(nearlywhatwewant)

# Let's just check that we're still sane
char(trainlabels[20]) # should be a k
ImageView.view(Images.grayim(reshape(train[20,:],20,20))) # should be a picture of a k

# All this preparation having been done, we can now feed the data into
# a random-forest making function:
sherwood=DecisionTree.build_forest(trainlabels,train,20,50,1.0) # again, bloody ages
## Ensemble of Decision Trees
## Trees:      50
## Avg Leaves: 2208.14
## Avg Depth:  19.3

# Now, how well does this forest do on the data on which it was trained?
shouldbegood=DecisionTree.apply_forest(sherwood, train)

# Looks like it only got one wrong
wrong=find(shouldbegood.!=trainlabels) # 3055
@printf("Testing a Random Forest on the data used to train it: errors=%s", size(wrong)[1])

char(shouldbegood[3055]) # E
char(trainlabels[3055])  # 1

# Apparently this 1 looks more like an E
# Sure looks like a 1 to me!
ImageView.view(Images.grayim(reshape(train[3055,:],20,20)))

# But you can't fault the classifier on the other 6282 images.
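Training-set error is a flattering number (one mistake in 6283 images). A quick sanity check, not in the original post, is to hold some images back and score on those instead. Here is a minimal sketch, assuming the train matrix and trainlabels vector built above, and the same Julia 0.3-era functions used throughout:

# Sketch, not in the original post: hold out a random 1000 images and see how
# a forest trained on the rest does on images it never saw during training.
n = size(train, 1)
perm = randperm(n)            # a random ordering of 1:n
holdout = perm[1:1000]        # images reserved for testing
keep = perm[1001:end]         # images used for training
forest = DecisionTree.build_forest(trainlabels[keep], train[keep,:], 20, 50, 1.0)
predictions = DecisionTree.apply_forest(forest, train[holdout,:])
errors = sum(predictions .!= trainlabels[holdout])
@printf("Errors on %d held-out images: %d\n", length(holdout), errors)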
Tuesday 6 January 2015
Julia (julialang) Kaggle Tutorial : Reading in the Images
#!/usr/bin/julia

# Our next task is to read in all the training images and make them
# into a big matrix.

# The code on the kaggle website doesn't seem to work for me, but after
# a bit of hunting around, I came up with this little program

using DataFrames
using Images
# lots of warnings, but it's ok

# Paths to our data files
labelsfile="./trainLabels.csv"
imagedirectory="./trainResized/"

@printf("Reading %s\n", labelsfile)
labels=readtable(labelsfile)
no_of_images=size(labels)[1]
@printf("Read in %d labels from %s\n", no_of_images, labelsfile)
@printf("Image %i is of an %s\n",labels[1,1],labels[1,2])
@printf("reading %s images\n", no_of_images)

# All the images have been resized to 20x20 in the trainResized directory
imageSize=20*20

# So let's try and get the desired effect on the first image
image="$(imagedirectory)1.Bmp"
img=imread(image)

# turn our colour image into a greyscale image
img_gs=convert(Image{Gray},img)

# turn the specialized image format into an array of floats
img_floats=reinterpret(Float32,float32(img_gs))

# turn the 20x20 array into a 1x400 vector
img_vec=reshape(img_floats,1,imageSize)

# After all that, I feel the need to check I haven't buggered it up
# There's a julia package for looking at images
Pkg.add("ImageView")
require("ImageView")

# Should pop up a little grey 'n'
ImageView.view(grayim(reshape(img_vec,20,20)))

# Now we want to use that process to convert all the images into one
# big array of image vectors

# Create a gigantic array to put the images in
x=zeros(no_of_images,imageSize)

# We can iterate over a dataframe's columns by name (it takes a while!)
for (a,b) in enumerate(labels[:ID])
    image="$(imagedirectory)$(b).Bmp"
    img=imread(image)
    assert(size(img)==(20,20)) # paranoia
    img_gs=convert(Image{Gray},img)
    assert(size(img_gs)==(20,20))
    img_floats=reinterpret(Float32,float32(img_gs))
    assert(size(img_floats)==(20,20))
    img_vec=reshape(img_floats,1,imageSize)
    assert(size(img_vec)==(1,400))
    @printf("%s %s\n",a,image)
    x[a,:]=img_vec
end

# and one final paranoid check
ImageView.view(grayim(reshape(x[6200,:],20,20)))
labels[6200,:]

# So that's how to turn 6000 or so images into one big matrix.
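Since building the matrix is so slow, it's tempting to cache it. Here is a small sketch, not in the original post, using the plain CSV helpers that ship with this era of Julia (the file is large, but the code is trivial), and assuming the x matrix built above:

# Sketch, not in the original post: dump the image matrix to disk once...
writecsv("trainImages.csv", x)
# ...and in a later session, reload it instead of re-reading 6000 bitmaps:
# x = readcsv("trainImages.csv")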
Monday 5 January 2015
Julia (julialang) Kaggle Tutorial : loading images and csv files
#!/usr/bin/julia

# Here are my notes from playing with the code at the start of the Kaggle Julia tutorial

# You only need to do this once for a given Julia installation, so I've commented it
# out so it doesn't get done every time
# Pkg.add("Images")
# Pkg.add("DataFrames")

# Paths to our data files
labelsfile="./trainLabels.csv"
imagedirectory="./trainResized/"

# The ground truth for our dataset is in a csv file, which we can read
# in using the DataFrames library
using DataFrames

# formatted print is a macro, hence the @ sign, it's much like C's printf
@printf("Reading %s\n", labelsfile)
labels=readtable(labelsfile)
@printf("Read in %d labels from %s\n", size(labels)[1], labelsfile)
@printf("Image %i is of an %s\n",labels[1,1],labels[1,2])

# indexing is like matlab, all of rows 1 to 3
labels[1:3,:]
## 3x2 DataFrame
## | Row | ID | Class |
## |-----|----|-------|
## | 1   | 1  | "n"   |
## | 2   | 2  | "8"   |
## | 3   | 3  | "T"   |

using Images
img=imread("$(imagedirectory)1.Bmp")

# The kaggle julia tutorial then tells us to do:
temp=float32(img)
## RGB Image with:
##   data: 20x20 Array{RGB{Float32},2}
##   properties:
##     IMcs: sRGB
##     spatialorder:  x y
##     pixelspacing:  1 1

## What did that just do?
help(float32)
## Base.float32(x)
##    Convert a number or array to "Float32" data type
# Not enormously the wiser now...

# Our image is 2 dimensional
ndims(temp) # 2

# And 20x20
size(temp) # (20,20)

# There's something funny going on here. This first image is a 20x20 colour bitmap,
# and yet the tutorial on the website has something about converting images to
# greyscale if ndims is 3. Our ndims is 2.

# finally we want to convert our 20x20 image to a 1x400 vector
imageSize=400
flat=reshape(temp,1,imageSize)
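One small aside, not in the original post: as well as matlab-style row/column indexing, DataFrame columns can be pulled out by name, which is how the later posts loop over the image IDs. A sketch, assuming the labels table read in above:

# Not in the original post: indexing columns by name rather than number.
labels[:ID][1:3]     # the first three image numbers
labels[:Class][1:3]  # the first three ground-truth characters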
Sunday 4 January 2015
Julia (julialang) : Using EMACS
The hardest bit of learning any new language for me is always figuring out how to use it with EMACS.
EMACS is much harder to learn to use than any programming language.
If you're new to Julia, and following this to learn, then I very strongly advise that you use almost any other editor.
One way to use any editor to play with julia is to use the editor to create this file:
#!/usr/bin/julia
using Images
using DataFrames
img=imread("./trainResized/1.Bmp")
labels=readtable("./trainLabels.csv")
print("Hello!")
print(img)
print(labels)

and then to save it as firststeps.jl.
You can then run it with
julia --load firststeps.jl
and it will execute the commands in the file, and then leave you at a julia prompt where you can continue to play.
As you get incantations working, then you can cut and paste them from the terminal where you are working back into an editor.
But I went through the pain of learning EMACS a long time ago, and once you have got the hang of it, you feel completely crippled in any other editor. I mean, will your editor turn your nice new program into a snippet of syntax-highlighted HTML that you can cut and paste into Blogger?
So for my fellow middle-aged computer scientists: on Debian 8, install emacs itself, and ess (Emacs Speaks Statistics, which used to be for R and S but now includes Julia support).
Create the file as above, which should be nicely syntax-highlighted since ess will recognise the .jl file as Julia and highlight accordingly, and then
M-x ess-load-file
will start an inferior julia process, and then pass it the command: include("/home/john/julia/kaggle/firststeps.jl")
leaving you with a running interpreter in a buffer *julia*
You can then use C-Enter to paste single lines from firststeps.jl into the *julia* buffer.
It is not exactly SLIME, but it is a damned sight better than almost any other approach I can think of, if you're already an EMACS user.
As I say, if you're not, just don't. You will spend the next year of your life fighting EMACS and messing with configuration files.
Julia (julialang) : Kaggle Competition and Julia Tutorial (First Steps with Julia)
So I want to try to do something real with Julia and see how easy it is to use, rather than working through tutorials.
Fortunately Kaggle (a site that is concerned with machine learning competitions) has a nice compromise: a set of test data combined with a Julia tutorial. The challenge is to recognise characters culled from Google Street View (road signs and such).
It's called First Steps with Julia, and I'm going to try it here.
If you'd like to follow along, sign up with Kaggle (you can just log straight in via Google and OpenID) and download the data files.
I've made a directory, ~/julia/kaggle, and then unzipped the file trainResized.zip (which results in a directory called trainResized with lots of .Bmp files in it).
I also need the file trainLabels.csv, which goes in the ~/julia/kaggle directory.
Kaggle tell us:
http://www.kaggle.com/c/street-view-getting-started-with-julia/details/julia-tutorial
that we need the Julia packages Images and DataFrames, so fire up julia and say
julia> Pkg.add("Images")
julia> Pkg.add("DataFrames")
and now
julia> using Images
This produces a huge list of angry-looking messages and a Segmentation fault! So Julia dies. However, if you start it again and try
julia> using Images
then it seems to be fine.
With some trepidation:
julia> using DataFrames
produces a lot of warning messages, but returns to the prompt.
After that we seem to have the relevant libraries loaded, as you can verify:
First check that we're in the right directory:
julia> pwd()
"/home/john/julia/kaggle"
And then, if your files are as suggested above:
julia> img=imread("./trainResized/1.Bmp")
julia> labels=readtable("./trainLabels.csv")
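As a quick sanity check (not part of the original post), the image should be a 20x20 bitmap and, going by the later posts, the label table should have 6283 rows, one per training image:

julia> size(img)     # should be (20,20)
julia> size(labels)  # should be (6283,2): one row per image, ID and Class columns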
Julia (julialang): Installation and Hello World
I've decided to have a go at julia, a new language for mathematical computing that's been described to me as 'like MATLAB, but not crap'.
First thing is to get it installed. Since I'm using Debian Jessie, which is just in its pre-release freeze, I figure that the julia package that comes with it should work.
That package suggests that I also install its docs and ess, which is the emacs mode for R and allied languages, so I eventually type:
sudo apt-get install julia julia-doc ess
This pulls in a lot of dependencies (including R, but that's not a bad thing to have around if you're planning on doing a bit of machine learning).
Once it's finished churning, typing
julia
results in this rather pretty display:
               _
   _       _ _(_)_     |  A fresh approach to technical computing
  (_)     | (_) (_)    |  Documentation: http://docs.julialang.org
   _ _   _| |_  __ _   |  Type "help()" for help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 0.3.2
 _/ |\__'_|_|_|\__'_|  |
|__/                   |  x86_64-linux-gnu

which is actually in colour on my terminal. Very modern!
And gives me a read-eval-print loop which works as expected.
julia> 2*3
6
julia> print("hello")
hello
Next I try creating a file hello.jl
# Hello Julia
print("hello\n")
Amazingly, emacs (with ess installed) highlights this correctly.
Both:
julia hello.jl
and
julia <hello.jl
seem to run it; I don't know which is preferred.
which julia
tells me that the julia program is:
/usr/bin/julia
And if I add a shebang line to hello.jl
#!/usr/bin/julia
and make the file executable with
chmod +x hello.jl
then I can run it with:
./hello.jl
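Putting those pieces together, the whole of hello.jl is just the three lines already shown above:

#!/usr/bin/julia
# Hello Julia
print("hello\n")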
For a new language, that all went remarkably smoothly, and it seems to work in the usual unixy way.