#!/usr/bin/env python
"""
Thierry Bertin-Mahieux (2011) Columbia University
tb2332@columbia.edu
This code creates a dataset based on 'genre', whatever we
can infer from tags
This is part of the Million Song Dataset project from
LabROSA (Columbia University) and The Echo Nest.
Copyright 2011, Thierry Bertin-Mahieux
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see https://www.gnu.org/licenses/.
"""
import os
import sys
import sqlite3
import numpy as np
from operator import itemgetter
import hdf5_getters as GETTERS
# The 10 genre labels used by this dataset, hand-picked from musicbrainz
# (loosely inspired by the genres of the GTZAN genre dataset).
GENRES = (
    'classic pop and rock',
    'punk',
    'folk',
    'pop',
    'dance and electronica',
    'metal',
    'jazz and blues',
    'classical',
    'hip-hop',
    'soul and reggae',
)
def path_from_trackid(msddir,trackid):
"""
Create a full path from the main MSD dir and a track id.
Does not check if the file actually exists.
"""
p = os.path.join(msddir,trackid[2])
p = os.path.join(p,trackid[3])
p = os.path.join(p,trackid[4])
p = os.path.join(p,trackid.upper()+'.h5')
return p
def feat_names():
    """
    Return the list of feature names, in order: 9 basic global fields,
    then avg_timbre1..avg_timbre12, then var_timbre1..var_timbre12.
    Meant to label the values returned by feat_from_file.
    """
    # basic global info
    basic = ['track_id','artist_name','title','loudness','tempo','time_signature','key','mode','duration']
    # one average and one variance name per timbre dimension, numbered from 1
    avg_names = ['avg_timbre'+str(dim) for dim in range(1,13)]
    var_names = ['var_timbre'+str(dim) for dim in range(1,13)]
    return basic + avg_names + var_names
def feat_from_file(path):
    """
    Extract a list of features from one song file, already converted
    to string.

    The values come in this order: track id, artist name, title, loudness,
    tempo, time signature, key, mode, duration, then the per-column average
    of the segments timbre matrix, then its per-column variance.
    Commas are removed from the artist name and the title
    (presumably so the values can be written as comma-separated text;
    confirm against the caller).

    The file is always closed before returning, even if one of the
    getters or numpy calls raises.

    INPUT
       path   - path to the song file, opened through hdf5_getters
    RETURN
       list of strings, one per feature
    """
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info
        feats = [
            GETTERS.get_track_id(h5),
            GETTERS.get_artist_name(h5).replace(',',''),
            GETTERS.get_title(h5).replace(',',''),
            GETTERS.get_loudness(h5),
            GETTERS.get_tempo(h5),
            GETTERS.get_time_signature(h5),
            GETTERS.get_key(h5),
            GETTERS.get_mode(h5),
            GETTERS.get_duration(h5),
        ]
        # timbre: statistics are taken along axis 0, i.e. one value per column
        # (assumes one row per segment -- TODO confirm with hdf5_getters)
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre,axis=0))
        feats.extend(np.var(timbre,axis=0))
    finally:
        # done with h5 file, release the handle even on error
        h5.close()
    # makes sure we return strings
    return [str(x) for x in feats]
def die_with_usage():
""" HELP MENU """
print 'create_genre_dataset.py'
print ' by T. Bertin-Mahieux (2011) Columbia University'
print ' tb2332@columbia.edu'
print ''
print 'USAGE'
print ' ./create_genre_dataset.py