# Arrays
import dask.array as da

x = da.random.uniform(low=0, high=10, size=(10000, 10000),  # normal numpy code
                      chunks=(1000, 1000))  # break into chunks of size 1000x1000
y = x + x.T - x.mean(axis=0) # Use normal syntax for high level algorithms
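Nothing has been computed at this point: Dask only builds a lazy task graph. Calling compute() executes that graph in parallel and returns a concrete result. A minimal sketch, continuing the array example above:

result = y.mean().compute()  # runs the task graph; returns a single NumPy float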
# DataFrames
import dask.dataframe as dd

df = dd.read_csv('2018-*-*.csv', parse_dates=['timestamp'],  # normal Pandas code
                 blocksize=64000000)  # break text into 64MB chunks
s = df.groupby('name').balance.mean() # Use normal syntax for high level algorithms
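As with arrays, read_csv and the groupby only describe the work; compute() executes it and hands back a concrete pandas object (head() is a handy exception that computes a small sample eagerly). Continuing the example above:

df.head()             # eagerly computes the first few rows as a pandas DataFrame
result = s.compute()  # executes the full groupby; returns a pandas Series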
# Bags / lists
import json
import dask.bag as db

b = db.read_text('*.json').map(json.loads)
total = (b.filter(lambda d: d['name'] == 'Alice')
          .map(lambda d: d['balance'])
          .sum())
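Bags are lazy too. A short sketch continuing the example, where take() computes a small peek eagerly and compute() runs the whole pipeline:

b.take(2)                 # first two parsed records, computed eagerly
result = total.compute()  # runs filter/map/sum across all partitions; returns a number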
# from sklearn.grid_search import GridSearchCV
from dklearn.grid_search import GridSearchCV

# from sklearn.pipeline import Pipeline
from dklearn.pipeline import Pipeline
Below is an example that uses a Pipeline, applying PCA followed by logistic regression.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=500,
                           n_classes=2, n_redundant=250,
                           random_state=42)
from sklearn import linear_model, decomposition
# from sklearn.pipeline import Pipeline
from dklearn.pipeline import Pipeline
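To round the example out, here is a minimal sketch of fitting the pipeline under a grid search, assuming dklearn's Pipeline and GridSearchCV are drop-in replacements for their scikit-learn counterparts; the hyperparameter values below are illustrative, not prescribed:

logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca),
                       ('logistic', logistic)])

# Illustrative search grid over PCA dimensionality and regularization strength
grid = {'pca__n_components': [50, 100, 250],
        'logistic__C': [1e-4, 1.0, 1e4]}

estimator = GridSearchCV(pipe, grid)
estimator.fit(X, y)
print(estimator.best_params_, estimator.best_score_)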