# Uses the Hugging Face dataset "korean_law_open_data_precedents".
# Install pinned library versions, one package per command.
# The leading `!` is IPython/Colab syntax that runs the line in a shell.
# `-q` quiets pip's output; `-U` upgrades an already-installed package so the
# environment ends up on exactly the version pinned with `==`.
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U trl==0.7.11
!pip3 install -q -U accelerate==0.27.2
from datasets import load_dataset
# Load the Korean court-precedent dataset from the Hugging Face Hub.
# The result is indexed by split name below (e.g. dataset['train']).
dataset = load_dataset(
    path="joonhok-exo-ai/korean_law_open_data_precedents",
)
!git clone <https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git>
cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab_light_220429.sh
# Drop the columns that are not needed (precedent serial number, case
# number, and the other metadata fields listed here). Columns are removed
# one at a time, in this order, and the train split is printed after each
# removal so the shrinking schema can be followed step by step.
unused_columns = [
    '판례정보일련번호',
    '사건번호',
    '사건명',
    '선고일자',
    '선고',
    '법원명',
    '사건종류명',
    '판결유형',
]
for column_name in unused_columns:
    dataset['train'] = dataset['train'].remove_columns([column_name])
    print(dataset['train'])
# Convert the trimmed train split into a pandas DataFrame and display it.
# `datasets` is kept as the name of the train split so any later code that
# refers to it keeps working.
datasets = dataset['train']
# Use the Dataset's own to_pandas() conversion instead of pd.DataFrame(...):
# `pd` is never imported in this file, so the original call raised NameError.
df = datasets.to_pandas()
print(df)