#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
###########################
# SemEval-2018 Task 2:
# Multilingual Emoji Detection
# Team: Duluth UROP
# Author: Shuning Jin
# Environment: Python 3.6
# Date: 2018-05-20
# Update: 2020-02-12
###########################
''' Description
File: preprocess.py
Preprocess text:
- normalize to lowercase
- handle punctuation and non-ASCII characters (remove or replace)
- tokenize and vectorize (bag of unigrams and bigrams)
- save the resulting document-term matrices as scipy sparse matrices
'''
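
# Example command-line invocation (file paths below are illustrative, not from
# the original repository):
#     python preprocess.py --train_text data/train_text.txt \
#                          --train_label data/train_labels.txt \
#                          --test_text data/test_text.txt \
#                          --run_dir myrun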
import argparse
import os
import sys
import time
from shutil import copy

import nltk
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import CountVectorizer


def handle_arguments(cl_arguments):
    parser = argparse.ArgumentParser(description="Preprocess text for emoji prediction (SemEval-2018 Task 2).")
    parser.add_argument("--train_text", type=str, required=True,
                        help="path to the training text file (one example per line)")
    parser.add_argument("--train_label", type=str, required=True,
                        help="path to the training label file (one integer label per line)")
    parser.add_argument("--test_text", type=str, required=False, default=None,
                        help="path to the test text file (one example per line)")
    parser.add_argument("--run_dir", type=str, required=True,
                        help="run name; outputs are written to experiment/<run_dir>/preprocess")
    return parser.parse_args(cl_arguments)


def save_sparse_matrix(path, matrix):
    save_npz(path, matrix)
    print('Save text as sparse matrix to: {:s}'.format(path))
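
# The matrices saved above can be reloaded downstream with scipy.sparse.load_npz.
# A minimal sketch (the path is illustrative; loading is not part of this script):
#     from scipy.sparse import load_npz
#     train_x_dtm = load_npz('experiment/myrun/preprocess/train_x_dtm.npz')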


def save_label(path, label_list):
    with open(path, 'w') as f:
        for line in label_list:
            print(line, file=f)
    print('Save labels to: {:s}'.format(path))


def load_label(path):
    with open(path, 'r') as f:
        label_list = [int(line.replace('\n', '')) for line in f]
    print('Load labels from: {:s}'.format(path))
    return label_list
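
# Label files are plain text with one integer class id per line, for example
# (values below are made up for illustration):
#     2
#     0
#     17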


def read_text(path):
    """
    read raw text from path, preprocess, return a list of sentences
    """
    def clean(raw):
        # lowercase
        raw = raw.lower()
        # punctuation
        raw = raw.replace(',', ' ')  # remove comma
        # non-ASCII characters
        # description: UTF-8 literal, unicode code point, name
        # remove
        raw = raw.replace('…', ' ')  # '\xe2\x80\xa6', U+2026, horizontal ellipsis
        raw = raw.replace('•', ' ')  # '\xe2\x80\xa2', U+2022, bullet
        raw = raw.replace('·', ' ')  # '\xc2\xb7', U+00B7, middle dot
        raw = raw.replace('・', ' ')  # '\xe3\x83\xbb', U+30FB, katakana middle dot
        raw = raw.replace(',', ' ')  # '\xef\xbc\x8c', U+FF0C, fullwidth comma
        raw = raw.replace('—', ' ')  # '\xe2\x80\x94', U+2014, em dash
        raw = raw.replace('–', ' ')  # '\xe2\x80\x93', U+2013, en dash
        # replace with standard ASCII
        raw = raw.replace('’', "'")  # '\xe2\x80\x99', U+2019, right single quotation mark
        raw = raw.replace('‘', "'")  # '\xe2\x80\x98', U+2018, left single quotation mark
        raw = raw.replace('“', '"')  # '\xe2\x80\x9c', U+201C, left double quotation mark
        raw = raw.replace('”', '"')  # '\xe2\x80\x9d', U+201D, right double quotation mark
        raw = raw.replace('!', '!')  # '\xef\xbc\x81', U+FF01, fullwidth exclamation mark
        return raw

    text = []
    with open(path, 'r') as f:
        for line in f:
            line = clean(line)
            line = line.strip('\n').strip(' ')
            text.append(line)
    print('Read text: {:s}. Total examples: {:d}'.format(path, len(text)))
    return text
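
# Illustrative effect of clean() on a made-up line (not from the dataset):
#     'Hello… I said “hi”, OK!'  ->  'hello  i said "hi"  ok!'
# Extra internal spaces are harmless because tokenization happens afterwards.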


def main(train_text_path, train_label_path, test_text_path, runname):
    print('\n--- PHASE: PREPROCESSING ---')

    # file logic
    run_dir = os.path.join('experiment', runname)
    preprocess_dir = os.path.join(run_dir, 'preprocess')
    os.makedirs(preprocess_dir, exist_ok=True)

    files = os.listdir(preprocess_dir)
    if 'train_x_dtm.npz' in files and 'test_x_dtm.npz' in files and 'train_y' in files:
        print('Preprocessed files already exist. Skipping this step.')
        return

    # vectorize data: bag of n-grams
    # features: unigrams + bigrams, document frequency cutoff = 5
    tokenizer = nltk.word_tokenize
    vect = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenizer, min_df=5)

    train_x = read_text(train_text_path)
    vect.fit(train_x)
    train_x_dtm = vect.transform(train_x)
    test_x = read_text(test_text_path)
    test_x_dtm = vect.transform(test_x)
    # print(len(vect.get_feature_names()))
    # print(train_x)
    # print(vect.get_feature_names())

    # save text x as sparse matrices
    save_sparse_matrix(os.path.join(preprocess_dir, 'train_x_dtm.npz'), train_x_dtm)
    save_sparse_matrix(os.path.join(preprocess_dir, 'test_x_dtm.npz'), test_x_dtm)

    # copy label y
    copy(src=train_label_path, dst=os.path.join(preprocess_dir, 'train_y'))
    # train_y = load_label(train_label_path)
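
# Outputs written under experiment/<run_dir>/preprocess/:
#     train_x_dtm.npz  - training document-term matrix (scipy sparse, via save_npz)
#     test_x_dtm.npz   - test document-term matrix in the same vocabulary
#     train_y          - verbatim copy of the training label file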


if __name__ == "__main__":
    args = handle_arguments(sys.argv[1:])
    train_text_path = args.train_text
    train_label_path = args.train_label
    test_text_path = args.test_text
    runname = args.run_dir

    start_time = time.time()
    main(train_text_path, train_label_path, test_text_path, runname)
    seconds = time.time() - start_time
    minutes = seconds / 60
    print("Preprocess time: {:.2f} seconds, {:.2f} minutes".format(seconds, minutes))