Learn practical skills, build real-world projects, and advance your career
Created 5 years ago
# Always start by checking which files are available!
!ls
item_categories.csv sample_submission.csv.gz
items.csv shops.csv
sales_train.csv test.csv
sales_train.csv.gz test.csv.gz
sample_submission.csv time_series_tutorials.ipynb
# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # prettier plot
# Time series
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf, arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
# Settings
import warnings
warnings.filterwarnings("ignore") # ignoring warning messages
# Load each CSV file into its own DataFrame.
# The file names carry no directory component, so they are resolved against
# the current working directory.
sales, item_cat, item, sub, shops, test = map(
    pd.read_csv,
    (
        "sales_train.csv",
        "item_categories.csv",
        "items.csv",
        "sample_submission.csv",
        "shops.csv",
        "test.csv",
    ),
)
# Convert the 'date' column from "dd.mm.yyyy" strings to datetime values.
# Tip: Series.apply(lambda x: ...) runs a Python function on every element of
# a column; here pd.to_datetime with an explicit format does the same parsing
# for the whole column in a single vectorized call, which avoids one
# strptime call per row.
sales['date'] = pd.to_datetime(sales['date'], format="%d.%m.%Y")
# Check that the 'date' column now has a datetime64 dtype.
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (doing so would emit an extra "None" line).
sales.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date datetime64[ns]
date_block_num int64
shop_id int64
item_id int64
item_price float64
item_cnt_day float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 134.4 MB
None