Grafik Problemi(matplotlib)

tngrfstk · Mayıs 6, 2021, 9:16öö

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

from sklearn.preprocessing import RobustScaler

plt.style.use("bmh")

import ta

from datetime import timedelta

from keras.models import Sequential

from keras.layers import LSTM, Dense, Dropout

# Load the raw price history and use the parsed trading date as the index
df = pd.read_csv("USDTRY=X.csv")
df['Date'] = pd.to_datetime(df.Date)
df = df.set_index('Date')

# Discard rows with missing values before the indicators are computed
df = df.dropna()

# Let the `ta` package add its full set of indicator columns
df = ta.add_all_ta_features(df, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)

# Keep 'Close' and the indicator columns; the other raw price columns are dropped
df = df.drop(columns=['Open', 'High', 'Low', 'Adj Close', 'Volume'])

# Quick sanity check of the resulting frame size
print(df.shape)

# Restrict the data to the 1000 most recent rows
df = df.tail(1000)

# A dedicated scaler fitted on 'Close' alone, so that predictions can be
# mapped back to real prices with inverse_transform later on
close_scaler = RobustScaler()
close_scaler.fit(df[['Close']])

# Scale every column, preserving column names and the date index
scaler = RobustScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# Plot of the (scaled) closing prices
df['Close'].plot(figsize=(16,5))
plt.title("Satis Fiyati")
plt.ylabel("Fiyat(olcekli)")
# plt.show()

def split_sequence(seq, n_steps_in, n_steps_out):
    """
    Cuts a multivariate time sequence into overlapping (input, target) windows.

    seq: 2-D array indexed as [time step, feature]; column 0 is used as the target
    n_steps_in: number of consecutive time steps in every input window
    n_steps_out: number of time steps that follow the input window and form the target

    Returns a pair of arrays: the input windows (all features) and the
    matching target windows (column 0 only).
    """
    total = len(seq)
    span = n_steps_in + n_steps_out

    # A window starting at `start` fits only while start + span <= total,
    # and there are never more starting points than rows in `seq`
    n_windows = min(total, total - span + 1)

    inputs = [seq[start:start + n_steps_in, :] for start in range(n_windows)]
    targets = [seq[start + n_steps_in:start + span, 0] for start in range(n_windows)]

    return np.array(inputs), np.array(targets)

def visualize_training_results(results):
    """
    Draws two figures from a training run: loss and accuracy, each showing
    the validation curve together with the training curve.

    results: object whose `history` attribute maps the keys 'loss',
             'val_loss', 'accuracy' and 'val_accuracy' to per-epoch values
    """
    history = results.history

    # One entry per figure: (series to draw, title, x label, y label)
    figures = (
        (('val_loss', 'loss'), 'Loss', 'Epochs', 'Loss'),
        (('val_accuracy', 'accuracy'), 'Dogruluk', 'Okuma', 'Dogruluk'),
    )

    for series, title, x_label, y_label in figures:
        plt.figure(figsize=(16,5))
        for key in series:
            plt.plot(history[key])
        plt.legend(list(series))
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.show()

def layer_maker(n_layers, n_nodes, activation, drop=None, d_rate=.5):
    """
    Adds a specified number of hidden LSTM layers to the module-level `model`.

    Optional: adds regularization in the form of Dropout layers to reduce
    potential overfitting (if necessary).

    n_layers: number of hidden LSTM layers to add
    n_nodes: number of units in each of those layers
    activation: activation passed to every LSTM layer
    drop: a Dropout layer is added after every `drop`-th hidden layer;
          None (the default) or 0 disables dropout entirely
    d_rate: rate passed to each Dropout layer

    Returns nothing; `model` is modified in place.
    """
    for x in range(1, n_layers + 1):
        # return_sequences=True because these layers feed further recurrent layers
        model.add(LSTM(n_nodes, activation=activation, return_sequences=True))

        # Explicit guard instead of a bare `except: pass`: "no dropout requested"
        # (None or 0) is skipped on purpose, while a genuine failure when
        # building or adding the Dropout layer is no longer silently swallowed
        if drop and x % drop == 0:
            model.add(Dropout(d_rate))

def validater(n_per_in, n_per_out):
    """
    Walks backwards through the module-level `df` in steps of `n_per_out`,
    predicting from a rolling window of `n_per_in` rows at every step.

    Each prediction is converted back to price units with `close_scaler` and
    labelled with business-day dates that start the day after the window ends.

    Returns a DF with the same index as `df` and a single column (the first
    column of `df`) holding the predicted values where they were written.
    """
    target_col = df.columns[0]

    # Empty frame on the original index that receives the predictions
    predictions = pd.DataFrame(index=df.index, columns=[target_col])

    for offset in range(1, len(df) - n_per_in, n_per_out):
        # Rolling window of n_per_in rows ending `offset` rows before the end
        window = df[-offset - n_per_in:-offset]

        # The model is fed a single sample of shape (1, n_per_in, n_features)
        model_input = np.array(window).reshape(1, n_per_in, n_features)
        scaled_forecast = model.predict(model_input)

        # Back from scaled values to prices
        forecast = close_scaler.inverse_transform(scaled_forecast)[0]

        # Business-day dates for the forecast, starting the day after the window
        forecast_dates = pd.date_range(start=window.index[-1]+timedelta(days=1),
                                       periods=len(forecast),
                                       freq="B")
        forecast_df = pd.DataFrame(forecast, index=forecast_dates, columns=[target_col])

        # Only dates that exist in `predictions` are filled in by update()
        predictions.update(forecast_df)

    return predictions

def val_rmse(df1, df2):
    """
    Calculates the root mean square error between the 'Close' columns of
    two DataFrames.

    df1: DataFrame with a 'Close' column; its index decides which rows are compared
    df2: DataFrame with a 'Close' column, aligned to df1 by index

    Rows where any value is missing (in df1 or in the aligned df2.Close) are
    ignored. Neither input is modified.

    Returns the RMSE as a plain float (nan when no rows can be compared).
    """
    merged = df1.copy()

    # Bring in the second frame's closing prices, aligned on df1's index
    merged['close2'] = df2.Close

    # Only rows present and complete in both frames take part
    merged = merged.dropna()

    # Cast explicitly: a frame built without data (e.g. the predictions frame)
    # carries object dtype, which is not a safe basis for numeric reductions
    diff = merged.Close.astype(float) - merged.close2.astype(float)

    # Mean of the squared differences, computed on a Series so the result is a
    # scalar; calling float() on a one-element Series is deprecated in pandas
    mean_sq = (diff ** 2).mean()

    # Returning the square root of the mean squared error
    return float(np.sqrt(mean_sq))

# Look-back window: how many past periods the network sees
n_per_in  = 30

# Forecast horizon: how many periods are predicted at once
n_per_out = 10

# One feature per column of the scaled DataFrame
n_features = df.shape[1]

# Turn the scaled data into (input window, target window) training pairs
X, y = split_sequence(df.to_numpy(), n_per_in, n_per_out)

# Network definition starts from an empty sequential model
model = Sequential()

# The same activation is used for every LSTM layer
activ = "tanh"

# First recurrent layer; it declares the expected input shape
model.add(LSTM(90, activation=activ, return_sequences=True, input_shape=(n_per_in, n_features)))

# Two hidden layers; with drop=1 a Dropout layer follows each of them
layer_maker(n_layers=2, n_nodes=30, activation=activ, drop=1, d_rate=.1)

# Last recurrent layer
model.add(LSTM(90, activation=activ))

# One output unit per predicted period
model.add(Dense(n_per_out))

# Overview of the assembled network
model.summary()

# 'accuracy' is requested because visualize_training_results plots
# history['accuracy'] and history['val_accuracy']
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Train with 10% of the samples held out for validation, then plot the curves
res = model.fit(X, y, epochs=2, batch_size=32, validation_split=0.1)
visualize_training_results(res)

# Real price levels: undo the scaling applied to the closing prices
actual = pd.DataFrame(
    close_scaler.inverse_transform(df[["Close"]]),
    index=df.index,
    columns=[df.columns[0]],
)

# Model predictions over the historical range, used for validation
predictions = validater(n_per_in, n_per_out)

# Error between the real prices and the validation predictions
print("RMSE:", val_rmse(actual, predictions))

# Predicted and real prices drawn on one figure
plt.figure(figsize=(16,6))
plt.plot(predictions, label='Tahmin edilen')
plt.plot(actual, label='Gerçek')
plt.title("Tahmin ve Gercek Fiyat")
plt.ylabel("Fiyat")
plt.legend()
plt.show()

# Predicting off of the most recent n_per_in rows of the scaled DF
yhat = model.predict(np.array(df.tail(n_per_in)).reshape(1, n_per_in, n_features))

# Transforming the predicted values back to their original price scale
yhat = close_scaler.inverse_transform(yhat)[0]

# Creating a DF of the predicted prices, dated on business days that start
# the day after the last known date
preds = pd.DataFrame(yhat,
                     index=pd.date_range(start=df.index[-1]+timedelta(days=1),
                                         periods=len(yhat),
                                         freq="B"),
                     columns=[df.columns[0]])

# Number of periods back to plot the actual values
pers = n_per_in

# Transforming the actual values to their original price.
# The first predicted value is deliberately NOT appended here: doing so put a
# forecast into the series plotted as "Gercek Fiyat" and drew a jump at its end
# that is not in the data. DataFrame.append is also unavailable in pandas 2.x.
actual = pd.DataFrame(close_scaler.inverse_transform(df[["Close"]].tail(pers)),
                      index=df.Close.tail(pers).index,
                      columns=[df.columns[0]])

# Printing the predicted prices
print(preds)

# Plotting the forecast next to the most recent actual prices
plt.figure(figsize=(15,6))
plt.plot(preds, label="Tahmini Fiyat")
plt.plot(actual, label="Gercek Fiyat")
plt.ylabel("Fiyat")
plt.xlabel("Tarihler")
plt.title(f"Onumuzdeki {len(yhat)} gun icin")
plt.legend()
plt.show()

Merhabalar,
Elimde bu şekilde bir kod var. Kodun mantığını kabaca anladım. Fakat gelecek günkü tahminlerde grafikte bozulma oluyor (Screenshot by Lightshot). Dolar ile alakası yok; BTC, EUR filan hepsinde aynı şey oluyor. Kur geçmişini Yahoo Finance'ten alıyorum.
Kodun orijinal hali burada (Price-Forecaster/Stock-RNN-Deep-Learning-TechIndicators.ipynb at master · marcosan93/Price-Forecaster · GitHub)

Bu sayfadaki " Validating the Model" kısmından hata verdiği için plt.xlim('2018-05', '2020-05') burayı çıkarttım ve çıkarttıktan sonra hata vermemeye başladı. Bu hatayı verdiği yer “tahmin ve gerçek fiyat” grafiği, “gelecek günkü tahmin” grafiği ile alakası yok gibi ama sizce grafikteki problem burayı çıkarttığım için mi oldu ? Sizce grafikte bozulma neden oluyor ve nasıl düzeltebilirim ?

anon18277073 · Mayıs 6, 2021, 1:21ös

Merhaba bozulmadan kastınız nedir tam olarak, nasıl bir grafik beklemekteydiniz?

tngrfstk · Mayıs 6, 2021, 2:00ös

usd-try olsun btc-usd olsun tahmini değerin başlangıcından önce, gerçek fiyat grafikte düşüş yaşıyor. Verilerimde bu şekilde bir veri yok halbuki Screenshot by Lightshot

anon18277073 · Mayıs 6, 2021, 2:25ös

En sondaki preds ve actual için,

print(preds) ve print(actual)'ı paylaşabilir misiniz?

tngrfstk · Mayıs 6, 2021, 3:41ös

preds

               Close
2021-05-06  7.056682
2021-05-07  6.994374
2021-05-10  6.947605
2021-05-11  6.982694
2021-05-12  6.995717
2021-05-13  6.988362
2021-05-14  6.955207
2021-05-17  6.906001
2021-05-18  6.916530
2021-05-19  6.963354
2021-05-20  7.007742
2021-05-21  7.021664
2021-05-24  7.016685
2021-05-25  7.009973
2021-05-26  7.128006
2021-05-27  7.152076
2021-05-28  7.158733
2021-05-31  7.181411
2021-06-01  7.203786
2021-06-02  7.286486
2021-06-03  7.226585
2021-06-04  7.307771
2021-06-07  7.291287
2021-06-08  7.306860
2021-06-09  7.241479
2021-06-10  7.314354
2021-06-11  7.315176
2021-06-14  7.315245
2021-06-15  7.307352
2021-06-16  7.380007

actual

               Close
2020-12-31  7.373730
2021-01-01  7.433800
2021-01-04  7.433420
2021-01-05  7.421500
2021-01-06  7.383790
...              ...
2021-04-30  8.184880
2021-05-03  8.274550
2021-05-04  8.259000
2021-05-05  8.321720
2021-05-06  7.056682

[91 rows x 1 columns]

Sonda birden iniş yaşıyor gibi. Sizce ne yapmalıyım ?

anon18277073 · Mayıs 6, 2021, 3:57ös

Evet, sebebi de actual oluşturulurken sona eklenen `.append(preds.head(1))` kısmı. actual’ın sonuna ilk tahmin değeri ekleniyor. Galiba görsel açıdan arada kopukluk olmasın diye yapılmış. Bu .append’i kaldırırsanız düzelir diye tahmin ediyorum.

tngrfstk · Mayıs 6, 2021, 5:13ös

Uğraştığınız için teşekkürler. Benim anlamadığım başka bir şey ise, tahmini değerlerin her dövizde gerçek değerin çok altında başlaması.

Burada adamın yaptığında tahmini değer gerçek değerin devamı niteliğinde, fakat bende sürekli ve her dövizde gerçek değerin aşağısından başlıyor.Sizce bu “epochs” miktarının azlığından mı kaynaklı ?

anon18277073 · Mayıs 6, 2021, 5:29ös

Olabilir evet, belki başka nedenler de olabilir. visualize_training_results sizde nasıl grafikler veriyor?

tngrfstk · Mayıs 6, 2021, 6:45ös

![Figure_3|690x276]

“epochs” değerini 1000 yaptım. Gözlemlerime göre epochs’u ne kadar arttırırsam, gelecek günkü tahmin arttırdığım miktarla orantılı olarak o kadar aşağıda başlıyor.

tngrfstk · Mayıs 6, 2021, 7:32ös

Hala anlayabilmiş değilim. Bu grafik adamın grafiği, ve gelecek günki tahmini asıl grafik ile uyum halinde. Bende ise gelecek tahmini grafiği gerçek grafikten aşağıda başlıyor. : (

acaba burayı çıkarttığım için mi oldu. Benim çalıştırdığım kod ile adamın kodunun tek farkı burası.
Çalıştırdığım koda adamın yaptığı gibi plt.xlim('2018-05', '2020-05') satırını ekleyince şu hatayı veriyor:

matplotlib.units.ConversionError: Failed to convert value(s) to axis units: '2005-05'

Yardımcı olabilirseniz çok çok sevinirim. 2 gündür bu sorunu çözmeye çalışıyorum

anon18277073 · Mayıs 6, 2021, 7:59ös

Yok, o x-ekseninin limitlerini ayarlamaya çalışıyor, sizde zaten kırmızının bittiği yerden mavi başlıyor bir sorun yok diye düşünüyorum. Yine de yapmak isterseniz o değerleri datetime’a çevirmeniz gerekebilir veya diğer elemanların formatını takip etmek gerekebilir (2018-05 yerine %Y-%m-%d, mesela 2018-05-01).

Validation loss bir yerden sonra artmaya başlamış ve dolayısıyla overfitting’e uğramış model. Eğer eğitildiği veri üzerindeki tahminini çizdirirseniz heralde neredeyse birebir takip ediyordur. Ama ertesi günler için performans veremiyor haliyle. (Bu arada accuracy böylesi bir regresyon task’ı için uygun bir metrik değil, o daha çok sınıflandırmada kullanılıyor.)

Şunları deneyebilirsiniz denemediyseniz:

epoch’u 100-400 arası tutmak iyi olabilir
optimizer’ın learning rate’i ile oynanabilir; default galiba 0.001 (Keras’taki Adam için), belki 0.05 ve o civarı denenebilir
batch_size da değiştirilebilir; ne kadar veri var bilmiyorum ama 64 veya 16 da denenebilir
Katman sayısı 1’e indirilebilir; hidden unit sayısı da 20’ye çekilebilir
Scaler Robust değil de düz MinMax veya StandardScaler olarak değiştirilebilir
Early stopping koyulabilir epoch’u kısıtlamadan; epoch 1_000 olabilir o durumda, kendi dursun. (ama duracağı metrik accuracy olmasın, pek makul değil)
Önemli bir parametre de kaç gün geriye baktığı. 30 çok olabilir.

Bunların hepsi varsayım, denemeden şunu yapın kesin daha iyi olur diyebilecek bir yetkinliğim yok. Bunları deneyip de yine benzeri sonuçla karşılaşabilirsiniz. Model şöyle ya da böyle sihirli bir değnek görevi göremeyip, orada tıkanabilir.

tngrfstk · Mayıs 6, 2021, 9:04ös

Teşekkürler, deneyeceğim.

tngrfstk · Mayıs 6, 2021, 10:17ös

Hepsini denedim ama olmadı. Saldım artık bu adamın kodunu : ) . Kendim derin öğrenmenin mantığını anlayarak yapmaya çalışacağım.