캔들차트 CNN 분석 - S0~S2

S0. 종목 선택

KOSPI200 의 시가총액 1위인 삼성전자 주식을 선택했다.

S1. 데이터 찾기

FinanceDataReader를 통해 불러온 일일 주가데이터를 사용한다.
불러온 데이터를 다시 csv파일로 저장해서 용이하게 한다.

def removeOutput(filepath):
    if(Path(filepath)).is_file():
        os.remove(filepath)
    
def csv_initiator(market, ticker, head_date, tail_date):
    df = fdr.DataReader(ticker, head_date, tail_date, exchange=market)
    
    filedir = os.getcwd() + '\\dataset\\raw_data\\'
    filename = "{}_{}.csv".format(market, ticker)    
    filepath = filedir + filename
    
    if not os.path.exists(filedir):
        os.makedirs(filedir)    
    removeOutput(filepath)
    df.to_csv(filepath)
    
    print("csv file saved as : {}".format(filepath))
    
    return filepath ---

S2. 데이터 전처리

결측치 제거, 일일 주가 트렌드 레이블링
이전단계에서 만들었던 데이터파일(csv파일)을 불러와서 레이블링이 진행
레이블링된 파일은 다른 디렉토리에 저장된다

def seqEnd_trend(csv_path, seq_len):
    print("Creating label . . .")
    print("type : sequence_end")
    
    # 데이터프레임으로 일일주가데이터 불러오기, 결측치 제거
    df = pd.read_csv(csv_path, parse_dates=True, index_col=0)
    df.fillna(0)
    df.reset_index(inplace=True)
    df['Date'] = df['Date'].map(mdates.date2num) # Y-M-D 포멧에서 num 포멧으로 변경
        
    # 파일을 저장할 디렉토리 명과 파일이름 지정
    filedir = os.getcwd() + '\\dataset\\labeled_data\\'
    filename = "{}_label_seq{}.txt".format(csv_path.split('\\')[-1][0:-4], seq_len) # ex) KRX_005930_label_seq30 
    filepath = filedir + filename
        
    # 디렉토리가 없을시 생성, 같은이름의 파일 제거
    if not os.path.exists(filedir):
        os.makedirs(filedir)
    removeOutput(filepath)

    # 레이블링
    for i in range(0, len(df)-int(seq_len)-1):
        tmp_df = df.iloc[i:i + int(seq_len)+1]  # seq_len+1 만큼 데이터프레임 슬라이싱
        starting = int(tmp_df["Close"].iloc[-2]) # seq 마지막날 종가
        endvalue = int(tmp_df["Close"].iloc[-1]) # seq 다음날 종가
        tmp_rtn = endvalue / starting - 1 
        
        if tmp_rtn > 0:
            label = 1
        else:
            label = 0
        # 레이블링한 sequence를 한 라인으로 파일에 입력        
        with open(filepath, 'a') as the_file:
            the_file.write("{}--{},{}".format(filename[0:-4], i, label))
            the_file.write("\n")

    print("Create label finished.")
    return filepath ---

캔들차트 생성 캔들차트 이미지를 만드는 것은 matplotlib.finance 의 candlestick2_ochl 매써드를 이용할 것이다. 레이블링 데이터와 각 캔들차트 조각 이미지를 어떻게 매칭 시키냐도 중요한 문제이다. 예를 들어 1월 1일 부터 1월 31일까지의 캔들차트 이미지를 준비했다면, 그 이미지로 다음 장열리는 날의 등락을 예측해야하므로 2월 1일(혹은 바로 다음 장 열리는날)의 등락을 0 또는 1로 레이블링 해야 할 것이다.

def ohlc2cs(csv_path, seq_len, dimension, use_volume):
    print("Converting ohlc to candlestick")

    # 데이터프레임으로 일일주가데이터 불러오기, 결측치 제거
    df = pd.read_csv(csv_path, parse_dates=True, index_col=0)
    df.fillna(0)
    df.reset_index(inplace=True)
    df['Date'] = df['Date'].map(mdates.date2num) # Y-M-D 포멧에서 num 포멧으로 변경
    
    # 파일을 저장할 디렉토리 명과 파일이름 지정
    symbol = csv_path.split('\\')[-1][0:-4]
    filedir = os.getcwd() + '\\dataset\\candle_chart\\{}\\seq{}_dim{}_vol{}\\'.format(
                                symbol, seq_len, dimension, use_volume) # ex) seq30_dim536_volFalse
    
    # 디렉토리가 없을시 생성, 이미 디렉토리가 있다면 내용물 삭제하고 다시 생성
    if not os.path.exists(filedir):
        os.makedirs(filedir)
    else:
        shutil.rmtree(filedir)
        os.makedirs(filedir)
    
    plt.style.use('dark_background') # 배경을 검은색으로
    
    for i in range(0, len(df)-int(seq_len)-1):
        tmp_df = df.iloc[i:i + int(seq_len)]
        if len(tmp_df) == int(seq_len):
            my_dpi = 109 # 자신의 모니터에 맞는 dpi 설정 27inch qhd -> 109
            fig = plt.figure(figsize=(dimension / my_dpi, dimension / my_dpi), dpi=my_dpi) # figure size 설정
            ax1 = fig.add_subplot(1, 1, 1)
            
            # Open, Close, High, Low column을 arguments로 넘겨준다.
            candlestick2_ochl(ax1, tmp_df['Open'], tmp_df['Close'], tmp_df['High'], tmp_df['Low'],
                                width=1, colorup='#ed3738', colordown='#0a7df3')
            
            # 학습에 필요한 부분만 남도록 정리
            ax1.grid(False)
            ax1.set_xticklabels([])
            ax1.set_yticklabels([])
            ax1.xaxis.set_visible(False)
            ax1.yaxis.set_visible(False)
            ax1.axis('off')
                        
            if use_volume: # volume 정보를 사용할 경우에
                ax2 = ax1.twinx()
                bc = volume_overlay(ax2, tmp_df['Open'], tmp_df['Close'], tmp_df['Volume'],
                                        colorup='#ed3738', colordown='#0A7df3',
                                        alpha=0.5,  width=1)
                ax2.add_collection(bc)
                
                # 학습에 필요한 부분만 남도록 정리
                ax2.grid(False)
                ax2.set_xticklabels([])
                ax2.set_yticklabels([])
                ax2.xaxis.set_visible(False)
                ax2.yaxis.set_visible(False)
                ax2.axis('off')
            
            filename = '{}-{}'.format(symbol, i)
            filepath = filedir + filename
            fig.savefig(filepath, pad_inches=0, transparent=False)
            plt.close(fig)
            
    print("Converting ohlc to candlestick finished") ---

데이터 전처리 작업에서 체계적으로 디렉토리를 구성하는 것은 매우 중요하다. 여러가지 데이터들을 일정한 기준에 맞추어 정리해놓지 않는다면 데이터를 사용하려고 할때 상당히 비효율 적이고 작업에 혼선이 생길 수 있기 때문이다. 파일디렉토리 구성을 다음과 같이 구성했다.

main 부분은 다음과 같다, 변수 부분을 조정하여, 종목이나 이미지 크기, 기간, 볼륨 사용 여부등 을 조정할 수 있다.

# main
market = 'KRX'
ticker = '005930'
seq_len = '30'
head_date = '2020-01-01'
tail_date = '2022-01-01'
dimension = 536
use_volume = False

data_csv_path = csv_initiator(market, ticker, head_date, tail_date) # 이 함수를 추가해서 FDR 이용하여 데이터를 불러오고 csv 파일로 저장하는 부분을 추가
label_set_path = seqEnd_trend(data_csv_path, seq_len)
ohlc2cs(data_csv_path, seq_len, dimension, use_volume) ---

소스코드 & 코드설명

SeungHyuck, Jung