Learn practical skills, build real-world projects, and advance your career
# Always start with checking out the files!
!ls
item_categories.csv sample_submission.csv.gz items.csv shops.csv sales_train.csv test.csv sales_train.csv.gz test.csv.gz sample_submission.csv time_series_tutorials.ipynb
# Basic packages
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd  # generating random numbers
import datetime      # manipulating date formats

# Viz
import matplotlib.pyplot as plt  # basic plotting
import seaborn as sns            # prettier plot

# Time series
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf, arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

# Settings
import warnings
# Install a blanket "ignore" filter: from here on every warning, whatever its
# category or originating module, is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above -- consider narrowing the filter if results look off.
warnings.filterwarnings("ignore")  # ignoring warning messages
# Load every CSV table from the working directory into its own DataFrame.
# The files are read in the order listed; each target name on the left lines
# up positionally with the file name at the same index in the tuple.
sales, item_cat, item, sub, shops, test = map(
    pd.read_csv,
    (
        "sales_train.csv",
        "item_categories.csv",
        "items.csv",
        "sample_submission.csv",
        "shops.csv",
        "test.csv",
    ),
)
# Convert the 'date' column from "dd.mm.yyyy" strings into datetime64 values.
# Tip: Series.apply(lambda x: ...) runs an arbitrary function on each value of
# a column, but it calls that function once per row. pd.to_datetime parses the
# whole column in a single vectorized call instead, and the explicit format
# guarantees day-first strings are never misread as month-first.
sales['date'] = pd.to_datetime(sales['date'], format="%d.%m.%Y")
# Check that the 'date' column now reports a datetime64 dtype.
# NOTE: DataFrame.info() prints its report itself and returns None, so the
# surrounding print() additionally emits a trailing "None".
print(sales.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2935849 entries, 0 to 2935848 Data columns (total 6 columns): date datetime64[ns] date_block_num int64 shop_id int64 item_id int64 item_price float64 item_cnt_day float64 dtypes: datetime64[ns](1), float64(2), int64(3) memory usage: 134.4 MB None