import pandas
# Load the training and test tables; the 'Id' column becomes the row index
# of each frame.
data_train, data_test = (
    pandas.read_csv(csv_path, index_col='Id')
    for csv_path in ('./salary_train.csv', './salary_test.csv')
)
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
%%time
# Categorical columns that are one-hot encoded through DictVectorizer.
cat_features = [
    'LocationNormalized',
    'ContractType',
    'ContractTime',
    'Company',
    'Category',
]

# Missing categorical values are replaced by '' so that "missing" becomes an
# ordinary category of its own.
#
# The rows are exported with to_dict('records'): a list holding one
# {column: value} dict per row, in the frame's row order.  The previous
# `.T.to_dict().values()` keyed the rows by the 'Id' index in a plain dict,
# which collapses rows sharing an Id and does not guarantee (on interpreters
# with unordered dicts) that the rows come back in frame order -- either case
# misaligns this matrix with the text features and the target.  It also
# avoids transposing the whole frame.
cat_vectorizer = DictVectorizer()
X_cat_train = cat_vectorizer.fit_transform(
    data_train[cat_features].fillna('').to_dict('records'))
# The test set is encoded with the vocabulary learned on the training set.
X_cat_test = cat_vectorizer.transform(
    data_test[cat_features].fillna('').to_dict('records'))
%%time
# TF-IDF encoding of the job description text.  min_df=10 is passed to the
# vectorizer; the vocabulary and weights are fitted on the training
# descriptions only and then reused unchanged for the test descriptions.
text_vectorizer = TfidfVectorizer(min_df=10)
X_text_train = text_vectorizer.fit_transform(data_train.FullDescription)
X_text_test = text_vectorizer.transform(data_test.FullDescription)
from scipy import sparse
# Regression target taken from the training table.
y_train = data_train['SalaryNormalized']

# Final design matrices: categorical columns first, text columns second,
# stacked side by side for the training and the test set alike.
X_train, X_test = (
    sparse.hstack(feature_blocks)
    for feature_blocks in (
        [X_cat_train, X_text_train],
        [X_cat_test, X_text_test],
    )
)
from sklearn.linear_model import Ridge
# Unfitted ridge regression estimator built with the library's default
# settings; this instance is the one handed to cross_val_score below.
model = Ridge()
%%time
from sklearn.cross_validation import cross_val_score, ShuffleSplit
# Estimate the generalisation error of `model` on 3 random shuffle splits of
# the training rows, scored by mean absolute error.
splits = ShuffleSplit(n=X_train.shape[0], n_iter=3)
# The splitter has to be passed through `cv`; without it `splits` is never
# used and cross_val_score falls back to its default splitting scheme.
# print(...) with a single argument behaves the same as the print statement
# on Python 2 and also parses on Python 3.
print(cross_val_score(model, X_train, y_train,
                      scoring='mean_absolute_error', cv=splits))
%%time
# Train a fresh ridge model on the complete training set, then predict the
# target for every row of the test matrix.
model = Ridge()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
import pandas
# Write the predictions to 'sample_submission.csv': one 'SalaryPredicted'
# column, indexed by the test table's index.
submission = pandas.DataFrame(
    {'SalaryPredicted': y_test_pred}, index=data_test.index)
submission.to_csv('sample_submission.csv')