{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PROSES ANOTASI KORPUS
" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Import Library NLTK & CSV" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
  "import nltk\n",
  "import csv"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Proses Membaca Korpus" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
  "def open_file(nameFile):\n",
  "    \"\"\"Return every line of the UTF-8 text file `nameFile` as a list.\n",
  "\n",
  "    Lines keep their trailing newline, exactly as readlines() yields them.\n",
  "    \"\"\"\n",
  "    # The with-block closes the handle even when reading raises.\n",
  "    with open(nameFile, encoding=\"utf-8\") as korpus_file:\n",
  "        return korpus_file.readlines()"
] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['administrator sistem dan anggota staf keperawatan harus dapat menambahkan daftar siswa ke dalam kategori\\n', 'administrator sistem dan anggota staf keperawatan harus dapat menambahkan satu siswa ke dalam kategori\\n']\n" ] } ], "source": [
  "# Load the corpus: one requirement statement per line.\n",
  "korpus = open_file('619 data korpus.txt')\n",
  "print(korpus[0:2])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Tokenisasi" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['administrator', 'sistem', 'dan', 'anggota', 'staf', 'keperawatan', 'harus', 'dapat', 'menambahkan', 'daftar', 'siswa', 'ke', 'dalam', 'kategori'], ['administrator', 'sistem', 'dan', 'anggota', 'staf', 'keperawatan', 'harus', 'dapat', 'menambahkan', 'satu', 'siswa', 'ke', 'dalam', 'kategori']]\n" ] } ], "source": [
  "# Split every corpus line into a list of word tokens.\n",
  "tokenize = [nltk.word_tokenize(reqs) for reqs in korpus]\n",
  "print(tokenize[0:2])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Pemberian Tag" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### a. 
Set Model dengan Tagset Fam Rashel" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
  "from nltk.tag import CRFTagger\n",
  "\n",
  "# The model path is relative, so the file must sit in the working directory.\n",
  "crft = CRFTagger()\n",
  "crft.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### b. Tagging" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[('administrator', 'NN'), ('sistem', 'NN'), ('dan', 'CC'), ('anggota', 'NN'), ('staf', 'NN'), ('keperawatan', 'NN'), ('harus', 'MD'), ('dapat', 'MD'), ('menambahkan', 'VB'), ('daftar', 'NN'), ('siswa', 'NN'), ('ke', 'IN'), ('dalam', 'NN'), ('kategori', 'NN')], [('administrator', 'NN'), ('sistem', 'NN'), ('dan', 'CC'), ('anggota', 'NN'), ('staf', 'NN'), ('keperawatan', 'NN'), ('harus', 'MD'), ('dapat', 'MD'), ('menambahkan', 'VB'), ('satu', 'CD'), ('siswa', 'NN'), ('ke', 'IN'), ('dalam', 'NN'), ('kategori', 'NN')]]\n" ] } ], "source": [
  "# tag_sents() takes the whole list of tokenized sentences and returns one\n",
  "# list of (token, tag) pairs per sentence, so a single call is enough;\n",
  "# there is no need to wrap each sentence in its own list and concatenate.\n",
  "postag = crft.tag_sents(tokenize)\n",
  "print(postag[0:2])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### c. 
Pemisahan Daftar Pernyataan Kebutuhan Perangkat Lunak dan Daftar Tag" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
  "# Split every tagged sentence into two parallel lists: its words and its tags.\n",
  "requirements = [[pair[0] for pair in sentence] for sentence in postag]\n",
  "taggings = [[pair[1] for pair in sentence] for sentence in postag]"
] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Jumlah Kata: 11391\n", "Jumlah Tag: 11391\n" ] } ], "source": [
  "# Sanity check: the total number of words must equal the total number of tags.\n",
  "counterWords = sum(len(reqs) for reqs in requirements)\n",
  "counterTags = sum(len(tags) for tags in taggings)\n",
  "print(\"Jumlah Kata: \",counterWords)\n",
  "print(\"Jumlah Tag: \",counterTags)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Transformasi Data Hasil Tag ke Dokumen" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
  "# A forward slash is accepted by open() on Windows, macOS and Linux, whereas a\n",
  "# backslash separator is Windows-only and forms an invalid escape sequence.\n",
  "# The target directory must already exist.\n",
  "# The with-block closes the file, so no explicit close() is needed.\n",
  "with open(\"Hasil Tag/Tag Korpus Anotasi.txt\", \"w\", encoding='utf-8') as file:\n",
  "    for reqs, tags in zip(requirements, taggings):\n",
  "        file.write(\"\\n\")  # blank line before each sentence\n",
  "        for word, tag in zip(reqs, tags):\n",
  "            # One token per line: word, a tab, then its tag.\n",
  "            file.write(word + \"\\t\" + tag + \"\\n\")\n",
  "        file.write(\"\\n\")  # blank line after each sentence"
] }
],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }