Twitterユーザーの性別を機械学習で予測する #Python

性別がタグ付けされたTwitterユーザー 2万人分のデータがあったので、このデータを使ってTwitterユーザーの性別予測を行ってみた。テキスト処理にはRuby、機械学習にはPythonを使っている。

先に結論

Twitterのプロフィールを用いた単純な機械学習による性別予測は、約60%の精度しかでなかった。

今回用いたデータは外国語のデータであり、日本語のプロフィールだと違った結果になるが、精度は同じようにあまりでないと思われる。こう思う理由は、「Twitterユーザーのデータは、そもそも人が見ても性別の判定が難しい」ため。

Twitterユーザー性別判定の手順

手順1〜5にはRuby、手順6にはPythonを使っている。

1. プロフィールに含まれる単語をリストアップする
2. 各単語の出現回数を記録する
3. 極端に出現回数が少ない、もしくは多すぎる単語は除去する
4. 単語数を次元数とみなしてユーザーのプロフィールをベクトルで表現する
5. 正解データのラベルを作成する
6. 機械学習を適用する

Rubyでテキストの事前処理を行う

前述の手順1〜5を行うRubyコードは下記の通り。今回のような手法だと、このテキスト処理の部分に性能が大きく左右される。やり方は無数にあり、このコードでは本当に最小限のテキスト処理しか行っていない。

# Loads the Kaggle "Twitter User Gender Classification" dataset:
# https://www.kaggle.com/crowdflower/twitter-user-gender-classification
#
# Reads gender-classifier-DFE-791531.csv from the current directory
# (transcoding ISO-8859-1 to UTF-8), treats the first record as the header and
# keeps only users labelled 'female' or 'male' whose 'gender:confidence' field
# is exactly the string '1'.
#
# NOTE(review): records are split on "\r" and fields on a bare ',' — quoted
# fields that contain commas are not handled; presumably acceptable for this
# experiment, confirm before reusing the parser elsewhere.
#
# @return [Array(Array, Array)] descriptions and genders, index-aligned; a
#   description is nil when its record has fewer fields than the header
def parse_kaggle_data
  raw = File.read('gender-classifier-DFE-791531.csv', encoding: 'ISO-8859-1:UTF-8')
  header, *rows = raw.split("\r").map { |record| record.split(',') }

  descriptions = []
  genders = []
  rows.each do |row|
    # Pair every header name with the field at the same position.
    user = header.zip(row).to_h
    next unless %w(female male).include?(user['gender'])
    next unless user['gender:confidence'] == '1'

    descriptions << user['description']
    genders << user['gender']
  end

  [descriptions, genders]
end

# Builds the sorted, de-duplicated, lower-cased vocabulary of a set of texts.
#
# Each text is split on whitespace, double quotes and the literal marker
# "__REP__". Every token then loses, in this order: a leading '#', trailing
# dots, trailing exclamation marks, a leading '(', a leading ')' and trailing
# ')'. Tokens shorter than two characters after cleaning are discarded.
#
# The trailing-punctuation patterns strip ONLY the punctuation. The previous
# patterns (/[^.]\.+$/ and /[^!]!+$/) also consumed the character in front of
# it, turning "hello." into "hell", and a closing parenthesis was only removed
# when it was the first character of the token.
#
# NOTE(review): the split pattern uses a capture group, so the delimiters are
# returned as tokens too. Single-character ones are dropped by the length
# filter, but "__REP__" survives as the word "__rep__" — confirm whether that
# is intended.
#
# @param text_array [Array<String>] raw texts
# @return [Array<String>] unique lower-cased words in ascending order
def split_to_words(text_array)
  tokens = text_array.flat_map { |d| d.split(/([\s"]|__REP__)/) }

  cleaned = tokens.map do |w|
    w.sub(/\A#/, '')
     .sub(/\.+\z/, '')
     .sub(/!+\z/, '')
     .sub(/\A\(/, '')
     .sub(/\A\)/, '')
     .sub(/\)+\z/, '')
  end

  cleaned.reject { |w| w.length < 2 }.map(&:downcase).uniq.sort
end

# Counts, for every word, the number of texts that contain it.
#
# Containment is a plain case-sensitive substring test (String#include?), so
# a word is also counted when it occurs inside a longer word, and each text
# contributes at most 1 per word.
#
# @param text_array [Array<String>] texts to scan
# @param word_array [Array<String>] candidate words
# @return [Hash{String => Integer}] per-word text counts; words found in no
#   text have no entry, and the hash answers 0 for missing keys
def count_words(text_array, word_array)
  text_array.each_with_object(Hash.new(0)) do |text, tally|
    word_array.each { |word| tally[word] += 1 if text.include?(word) }
  end
end

# Build the binary bag-of-words matrix for the profile descriptions and write
# it to data/description_vectors.txt (one user per line, space-separated 0/1).
descriptions, genders = parse_kaggle_data

desc_words = split_to_words(descriptions)

# The vocabulary is lower-cased, so it has to be matched against lower-cased
# text as well. Matching against the raw descriptions would never find a word
# that only ever appears capitalised ("Ruby" vs. "ruby"), and such words would
# silently fall out of the feature set.
# NOTE(review): matching is by substring (String#include?), so a short word
# also matches inside longer words — confirm this is intended.
normalized_descriptions = descriptions.map(&:downcase)

# Keep only words that occur in more than 2 and fewer than 500 descriptions.
desc_words_count = count_words(normalized_descriptions, desc_words)
filtered_desc_words = desc_words.select { |w| desc_words_count[w] > 2 && desc_words_count[w] < 500 }

# One row per user, one column per kept word: 1 if the description contains it.
desc_vectors = normalized_descriptions.map { |d| filtered_desc_words.map { |w| d.include?(w) ? 1 : 0 } }

# File.write does not create missing directories.
Dir.mkdir('data') unless Dir.exist?('data')
File.write('data/description_vectors.txt', desc_vectors.map { |v| v.join(' ') }.join("\n"))

# Encode every gender string as an integer class id and write the ids to
# data/labels.txt, one per line, in the same order as the feature vectors.
# A value that is not listed in the table maps to nil.
label_ids = {
  ''        => 0,
  'brand'   => 1,
  'female'  => 2,
  'male'    => 3,
  'unknown' => 4
}
labels = genders.map { |g| label_ids[g] }
File.write('data/labels.txt', labels.join("\n"))

Pythonで機械学習を行う

ナイーブベイズ、ロジスティック回帰、ランダムフォレスト、サポートベクターマシンを試した結果、どれも似たような結果になっている。

手法	精度
ナイーブベイズ（正規分布）	0.5493
ナイーブベイズ（ベルヌーイ）	0.6367
ロジスティック回帰	0.6151
ランダムフォレスト	0.6339
サポートベクターマシン	0.6303

それぞれの手法には元データに対する暗黙の仮定があるが、今回はそれは考慮せず単純に結果を比較している点に注意が必要。

# sudo yum install -y python3
# sudo pip3 install -U pip numpy scikit-learn ipython

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

# Load the feature matrix and the labels written by the Ruby preprocessing step.
description_vectors = np.loadtxt('data/description_vectors.txt')
labels = np.loadtxt('data/labels.txt')

# Hold out part of the data for evaluation (train_test_split defaults; the
# split is not seeded, so the numbers vary between runs).
(x_train, x_test, y_train, y_test) = train_test_split(description_vectors, labels)

# Fit every candidate model and print its hold-out accuracy. Previously `clf`
# was simply reassigned five times, so only the last model (SVC) was ever
# scored, and the accuracy expression was evaluated but never printed.
classifiers = [
    GaussianNB(),
    BernoulliNB(),
    LogisticRegression(),
    RandomForestClassifier(),
    SVC(C=1.0),
]
for candidate in classifiers:
    clf = candidate.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(type(clf).__name__, np.mean(y_test == y_pred))
# After the loop `clf` and `y_pred` refer to the SVC, as before.

# Grid search
#
# Run one search per estimator and print its outcome right away. Previously the
# SVC search result was overwritten by the random-forest search before anything
# was printed.
search_spaces = [
    # best params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
    (SVC(),
     [{'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'C': np.logspace(-2, 2, 5), 'gamma': ['scale']}]),
    # best params: {'max_depth': 100, 'n_estimators': 300}
    (RandomForestClassifier(),
     [{'n_estimators': [30, 50, 100, 300], 'max_depth': [25, 30, 40, 50, 100]}]),
]
for estimator, parameters in search_spaces:
    clf = GridSearchCV(estimator, parameters, verbose=True, n_jobs=-1)
    clf.fit(x_train, y_train)

    print(clf.best_params_)
    print(clf.best_score_)
    print(clf.best_estimator_)

# Re-predict with the model selected by the last search; otherwise the reports
# below would describe predictions made before the grid search ran.
y_pred = clf.predict(x_test)

print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Model persistence
#
# Use context managers so the file handles are closed deterministically
# (the bare open() calls were never closed).
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files you created yourself.
with open('model.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

with open('model.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

Twitterユーザーの性別を機械学習で予測する

先に結論

Twitterユーザー性別判定の手順

Rubyでテキストの事前処理を行う

Pythonで機械学習を行う

関連リンク