def get_qage(x, dct):
    """Map a raw continuous value to its quantile-bucket score.

    `dct` maps pandas Interval objects (left-open, right-closed, as
    produced by `pd.qcut`) to that bucket's numeric score.  Returns the
    score of the interval containing `x`.

    NOTE(review): falls through and returns None when `x` lies outside
    every training interval (e.g. below the train-set minimum) —
    confirm downstream code tolerates that.
    """
    for interval in dct:
        if interval.left < x <= interval.right:
            return dct.get(interval)


# Discretize the continuous numeric columns the way the paper does:
# nq equal-frequency buckets, scored evenly from 0 to 1.
for col in CATEGORICAL_NUM_COLUMNS:
    nq = 10
    df_train["Q" + col] = pd.qcut(df_train[col], nq)
    # Map each train-set interval to an evenly spaced score in [0, 1].
    dct = dict(zip(df_train["Q" + col].values.categories,
                   [i / (nq - 1) for i in range(nq)]))
    df_train["Q" + col] = df_train["Q" + col].apply(lambda x: dct[x])
    # Bucket the test rows against the intervals learned on train.
    df_test["Q" + col] = df_test[col].apply(lambda x: get_qage(x, dct))
接下来就可以得到需要的训练数据了:
# Fold the quantile-bucket columns into the continuous feature list,
# then keep only the columns the model will actually consume.
CONTINUOUS_COLUMNS = CONTINUOUS_COLUMNS + ["Q" + c for c in CATEGORICAL_NUM_COLUMNS]
model_columns = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + [LABEL_COLUMN]
df_train = df_train[model_columns]
df_test = df_test[model_columns]
def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to feature `name` of `dataset`."""
    # Dataset yielding just this one feature, for adapt().
    feature_ds = dataset.map(lambda x, y: x[name])
    # Fit the layer's mean/variance statistics on that feature.
    norm_layer = preprocessing.Normalization()
    norm_layer.adapt(feature_ds)
    return norm_layer
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Return a callable that one-hot encodes categorical feature `name`.

    A lookup layer (string or integer, per `dtype`) is adapted on the
    feature's values to assign each one a fixed index; a CategoryEncoding
    layer then turns those indices into one-hot vectors.
    """
    # Pick the lookup flavor that matches the raw feature's dtype.
    lookup = (preprocessing.StringLookup(max_tokens=max_tokens)
              if dtype == 'string'
              else preprocessing.IntegerLookup(max_values=max_tokens))

    # Dataset yielding just this feature; learn its vocabulary.
    feature_ds = dataset.map(lambda x, y: x[name])
    lookup.adapt(feature_ds)

    # One-hot encoder sized to the learned vocabulary; adapt it on the
    # integer indices produced by the lookup.
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
    one_hot.adapt(feature_ds.map(lookup))

    # Capture both layers in a closure so they can be applied together
    # (or embedded in a functional model) later.
    return lambda feature: one_hot(lookup(feature))