编辑:Peter 作者:Peter
介绍一个基于深度学习的实战项目:基于长短期记忆模型(LSTM)的股价预测,包含:
LSTM的介绍 1、https://easyai.tech/ai-definition/lstm/
2、https://zh.d2l.ai/chapter_recurrent-modern/lstm.html
导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline
from pandas_datareader.data import DataReader
# 专门用来获取金融股票数据的第三方包
import yfinance as yf
from pandas_datareader import data as pdr
yf.pdr_override()
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler  # 数据归一化
import warnings
warnings.filterwarnings("ignore")
生成数据 基于yfinance生成数据:
# Download roughly three years of price history for four tech companies.
tech_list = ['AAPL', 'GOOG', 'MSFT', 'AMZN']  # ticker symbols to fetch

end = datetime.now()  # end of the query window
try:
    start = datetime(end.year - 3, end.month, end.day)
except ValueError:
    # Only reachable on 29 Feb: the year three years back is never a leap
    # year, so fall back to 28 Feb instead of crashing.
    start = datetime(end.year - 3, 2, 28)

# One DataFrame per ticker, bound to explicit names (later cells use AAPL,
# GOOG, MSFT and AMZN directly) rather than injected through globals().
AAPL, GOOG, MSFT, AMZN = (yf.download(stock, start, end) for stock in tech_list)

company_list = [AAPL, GOOG, MSFT, AMZN]
company_name = ["APPLE", "GOOGLE", "MICROSOFT", "AMAZON"]

# Tag every frame with a readable company label before stacking them.
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

df = pd.concat(company_list, axis=0)
df.tail()
[*********************100%%**********************] 1 of 1 completed [*********************100%%**********************] 1 of 1 completed [*********************100%%**********************] 1 of 1 completed [*********************100%%**********************] 1 of 1 completed
数据信息 AAPL.head() # 4个DataFrame: 'AAPL', 'GOOG', 'MSFT', 'AMZN'
AAPL.info()
<class 'pandas .core .frame .DataFrame '> DatetimeIndex : 752 entries, 2021-09-03 to 2024-08-30 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Open 752 non-null float64 1 High 752 non-null float64 2 Low 752 non-null float64 3 Close 752 non-null float64 4 Adj Close 752 non-null float64 5 Volume 752 non-null int64 6 company_name 752 non-null object dtypes: float64(5 ), int64(1 ), object(1 ) memory usage: 47.0 + KB
# Closing price: draw the adjusted close of each company on a 2x2 grid.
plt.figure(figsize=(15, 10))
plt.subplots_adjust(top=1.25, bottom=1.2)

# Pair each ticker with its frame so the title needs no index arithmetic.
for position, (ticker, frame) in enumerate(zip(tech_list, company_list), start=1):
    plt.subplot(2, 2, position)
    frame["Adj Close"].plot()
    plt.ylabel("Adj Close")
    plt.xlabel(None)
    plt.title(f"Closing Price of {ticker} ")

plt.tight_layout()
# Volume of sales: draw the traded volume of each company on a 2x2 grid.
plt.figure(figsize=(15, 10))
plt.subplots_adjust(top=1.25, bottom=1.2)

# Pair each ticker with its frame so the title needs no index arithmetic.
for position, (ticker, frame) in enumerate(zip(tech_list, company_list), start=1):
    plt.subplot(2, 2, position)
    frame["Volume"].plot()
    plt.ylabel("Volume")
    plt.xlabel(None)
    plt.title(f"Sales Volume of {ticker} ")

plt.tight_layout()
# Moving average of stocks: add one rolling-mean column per window length.
ma_day = [5, 10, 20, 50]  # window lengths, in rows

for company in company_list:
    adj_close = company["Adj Close"]
    for ma in ma_day:
        # Mean of the adjusted close over the trailing `ma` rows.
        company[f"MA for {ma} days"] = adj_close.rolling(window=ma).mean()
company.columns
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'company_name', 'MA for 5 days', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days'], dtype='object')
# Visualisation: adjusted close plus its moving averages, one panel per company.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_figheight(10)
fig.set_figwidth(15)

ma_columns = ['Adj Close', 'MA for 5 days', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
panels = zip(axes.flat, (AAPL, GOOG, MSFT, AMZN), ('APPLE', 'GOOGLE', 'MICROSOFT', 'AMAZON'))
for ax, frame, title in panels:
    frame[ma_columns].plot(ax=ax)
    ax.set_title(title)

fig.tight_layout()
从上图中可以观察到,5日和10日均线能够更好地捕捉到数据的变化趋势。
日收益daily return pct_change 函数在 pandas 中非常有用,特别是在处理时间序列数据时。这个函数计算序列中每个元素与前一个元素之间的百分比变化。它通常用于财务数据分析,比如计算股票价格、货币汇率、销售量等的日百分比变化率。
# Computing the daily return with pct_change.
for company in company_list:
    adj_close = company["Adj Close"]
    # Relative change of each adjusted close versus the previous row.
    company["Daily Return"] = adj_close.pct_change()

# `company` still refers to the last frame in company_list here.
company.head()
可视化(散点图)
# Daily return of each company as a dashed line with point markers.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_figheight(10)
fig.set_figwidth(15)

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
panels = zip(axes.flat, (AAPL, GOOG, MSFT, AMZN), ('APPLE', 'GOOGLE', 'MICROSOFT', 'AMAZON'))
for ax, frame, title in panels:
    frame['Daily Return'].plot(ax=ax, legend=True, linestyle='--', marker='o')
    ax.set_title(title)

fig.tight_layout()
# Visualisation (histogram): distribution of daily returns per company.
plt.figure(figsize=(12, 9))

for position, (label, frame) in enumerate(zip(company_name, company_list), start=1):
    plt.subplot(2, 2, position)            # panel position in the 2x2 grid
    frame['Daily Return'].hist(bins=50)    # 50 histogram bins
    plt.xlabel('Daily Return')
    plt.ylabel('Counts')
    plt.title(f'{label} ')

plt.tight_layout()
相关性 单独生成股价的收盘价作为closing_df:
# Fetch all tickers in a single call through pandas_datareader (`pdr`)
# and keep only the adjusted closing prices.
quotes = pdr.get_data_yahoo(tech_list, start=start, end=end)
closing_df = quotes['Adj Close']
closing_df.head()
[*********************100%%**********************] 4 of 4 completed
对closing_df使用pct_change函数:
tech_rets = closing_df.pct_change() tech_rets.head()
# Visualising correlations between daily returns.
# First GOOG against itself (self-correlation), then GOOG against two other
# tickers (pairwise correlation); each figure is shown before the next one.
for other, colour in (('GOOG', 'seagreen'), ('MSFT', 'blue'), ('AMZN', 'blue')):
    sns.jointplot(x='GOOG', y=other, data=tech_rets, kind='scatter', color=colour)
    plt.show()

# Overall correlation (sns.pairplot): every pair at once, with regression fits.
sns.pairplot(tech_rets, kind='reg')
plt.show()
整体相关性(sns.PairGrid) 使用sns.PairGrid()可以让我们更灵活地控制图表的布局和类型:
# PairGrid over the daily returns (rows with missing values removed).
clean_returns = tech_rets.dropna()
return_fig = sns.PairGrid(clean_returns)

# Each region of the grid gets its own plot type.
return_fig.map_diag(plt.hist, bins=30)              # diagonal: histograms
return_fig.map_upper(plt.scatter, color='purple')   # upper triangle: scatter
return_fig.map_lower(sns.kdeplot, cmap='cool_d')    # lower triangle: KDE

plt.show()
收盘价的整体相关性:
# PairGrid over the adjusted closing prices themselves.
returns_fig = sns.PairGrid(closing_df)

# Each region of the grid gets its own plot type.
returns_fig.map_diag(plt.hist, bins=30)              # diagonal: histograms
returns_fig.map_upper(plt.scatter, color='purple')   # upper triangle: scatter
returns_fig.map_lower(sns.kdeplot, cmap='cool_d')    # lower triangle: KDE

plt.show()
# Heatmaps: correlation matrix of the daily returns next to that of the
# closing prices, in the first two slots of a 2x2 layout.
plt.figure(figsize=(12, 10))

panels = (
    (tech_rets, 'Correlation of stock return'),
    (closing_df, 'Correlation of stock closing price'),
)
for slot, (frame, title) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    sns.heatmap(frame.corr(), annot=True, cmap='summer')
    plt.title(title)

plt.show()
风险评估value risk 量化风险的方法有很多。利用已收集的每日百分比回报率,最基本的方法之一是将预期回报率(日回报率的均值)与日回报率的标准差(即风险)进行比较。
# Risk versus expected return: mean daily return on x, its standard
# deviation on y, one labelled point per ticker.
rets = tech_rets.dropna()
area = np.pi * 20  # marker size passed to scatter

expected_return = rets.mean()
risk = rets.std()

plt.figure(figsize=(10, 8))
plt.scatter(expected_return, risk, s=area)
plt.xlabel("Expected Return")
plt.ylabel("Risk")

# Curved connector from each label (offset by 50 points) to its marker.
arrow_style = dict(arrowstyle='-', color='blue', connectionstyle='arc3,rad=-0.3')
for label, x, y in zip(rets.columns, expected_return, risk):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(50, 50),
        textcoords='offset points',
        ha='right',
        va='bottom',
        arrowprops=arrow_style,
    )
建模预测Predicting the closing price 项目地址:https://www.kaggle.com/code/faressayah/stock-market-analysis-prediction-using-lstm
# Build the modelling data: AAPL quotes from 2012-01-01 up to now.
df = pdr.get_data_yahoo("AAPL", start="2012-01-01", end=datetime.now())
df.tail()

# Visualisation: history of the closing price.
close_prices = df['Close']
plt.figure(figsize=(16, 6))
plt.plot(close_prices)
plt.title('Close Price History')
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.show()
# Extract the Close column as a one-column frame and as a NumPy array.
data = df.filter(items=["Close"])
dataset = data.values
dataset[:5]
# Output recorded in the original run:
# array([[14.6867857], [14.76571369], [14.92964268], [15.08571434], [15.0617857]])

# Length of the training set: the first 95% of the rows, rounded up.
row_count = dataset.shape[0]
training_dataset_len = int(np.ceil(row_count * 0.95))
training_dataset_len
3027
# Normalisation: rescale closing prices relative to the training period.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

# Learn min/max from the training rows only. Fitting on the whole series
# would leak the price range of the held-out period into training.
scaler.fit(dataset[:training_dataset_len])

# Held-out rows are scaled with the same statistics, so they may fall
# outside [0, 1]; inverse_transform still maps them back correctly.
scaled_data = scaler.transform(dataset)
scaled_data
# Output recorded in the original article (scaler fitted on the entire
# series, so the values will differ after this change):
# array([[0.00334711], [0.00370446], [0.00444665], ..., [0.96228593], [0.97722662], [0.97364992]])
选择训练集数据 选择指定长度下的训练集数据:
# Leading rows of the scaled series form the training portion.
train_data = scaled_data[:training_dataset_len]
切分训练集
将train_data切分成x_train和y_train:
# Each sample is the 60 preceding scaled values; its target is the next value.
x_train = []
y_train = []

for target_idx in range(60, len(train_data)):
    window_start = target_idx - 60
    x_train.append(train_data[window_start:target_idx, 0])
    y_train.append(train_data[target_idx, 0])
    # Show the first two samples as a sanity check.
    if target_idx < 62:
        print(x_train)
        print(y_train)
        print()

x_train = np.array(x_train)
y_train = np.array(y_train)

# Reshape to (samples, timesteps, 1), the layout the LSTM layer is given.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
# Build the LSTM network.
from keras.models import Sequential
from keras.layers import Dense, LSTM

timesteps = x_train.shape[1]  # length of each input window

model = Sequential()
# Two stacked LSTM layers followed by two dense layers; the first LSTM
# returns the full sequence so the second one can consume it.
for layer in (
    LSTM(128, return_sequences=True, input_shape=(timesteps, 1)),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
):
    model.add(layer)
# Compile and train the network (compile + fit).
model.compile(loss="mean_squared_error", optimizer="adam")

# A single epoch with one sample per batch.
model.fit(x_train, y_train, epochs=1, batch_size=1)
# Log recorded in the original run:
# 2967/2967 [==============================] - 36s 11ms/step - loss: 7.2025e-04
# Build the test set. The slice starts 60 rows before the split so the
# first test sample has a full look-back window.
test_data = scaled_data[training_dataset_len - 60:]

# Targets stay in original (unscaled) price units.
y_test = dataset[training_dataset_len:]

x_test = np.array(
    [test_data[stop - 60:stop, 0] for stop in range(60, len(test_data))]
)

# Reshape to (samples, timesteps, 1), matching the training input.
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
预测过程predict
# The model outputs scaled values; map them back to price units.
scaled_predictions = model.predict(x_test)
predictions = scaler.inverse_transform(scaled_predictions)
5/5 [==============================] - 1s 16ms/step
# Compute the RMSE between predicted and actual closing prices.
residuals = predictions - y_test
mean_squared_error = np.mean(residuals ** 2)
rmse = np.sqrt(mean_squared_error)
rmse
4.422715565017136
# Prediction visualisation: training history, held-out closes and model output.
train = data[:training_dataset_len]           # rows used for training
# Take an explicit copy: adding a column to a bare slice of `data` is a
# chained assignment (SettingWithCopy) and may not behave as intended.
valid = data[training_dataset_len:].copy()    # held-out rows
valid["Predictions"] = predictions

plt.figure(figsize=(16, 6))
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])

# Axis labels, legend and title.
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.title('Model')
plt.show()