nlpperceptron-TjAUahEVyKw
# @
# Original text: https://mlwave.com/online-learning-perceptron/
# Its code:
# https://github.com/MLWave/online-learning-perceptron
# https://github.com/MLWave/online-learning-perceptron/blob/master/perceptron.py
# @
# Corresponding paper: https://link.springer.com/article/10.1007%2FBF02478259
# @
# This online learning perceptron is implemented with only the Python standard library,
# so the script can run under PyPy for a 3-4x speedup, without numpy, scikit-learn, pandas, etc.
# The algorithm was inspired by the 'online logistic regression script' from the
# Display Advertising Challenge (Kaggle): "Beat the benchmark with less than 200MB of memory."
# Code: https://kaggle2.blob.core.windows.net/forum-message-attachments/53646/1539/fast_solution.py
# @
# The perceptron supports online learning, which means it learns from one sample at a time.
# The model therefore never needs the entire dataset in memory (it is an out-of-core algorithm),
# which is beneficial for large datasets such as stock prediction or other time-series data.
# Vowpal Wabbit (fast learning) is a well-known example of this approach.
# @
# Hashing Representations
# The vectorized hashing trick was popularized by Vowpal Wabbit.
# It fixes the number of inputs connected to the perceptron ahead of time,
# and hashes every raw feature to an index below that fixed size (here fixed_size = 1024).
# Because Vowpal Wabbit never has to read all data into memory at once,
# it can train models at very high speed.
# @
sample = "This movie sucks"
fixed_size = 1024
print(sample.split())
features = [(hash(f) % fixed_size, 1) for f in sample.split()]  # list of tuples in the form (feature_index, feature_value)
print(features)
# < ['This', 'movie', 'sucks']
# < [(746, 1), (981, 1), (282, 1)]
# @
# Progressive test loss
# Because the model trains on one sample at a time, the training loss shrinks gradually.
# For each incoming sample the model first predicts without looking at the target,
# and only then compares that prediction with the label to update the running error rate
# (a small self-contained sketch of this bookkeeping follows below).
# @
# With Vowpal Wabbit you can also pass the dataset through the model several times with a low learning rate.
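# @
# The "predict first, then learn" bookkeeping described above can be sketched in a few
# lines. This is only a toy illustration: the stream of 0/1 labels and the simple
# majority-vote "learner" are made up here and are not part of the original script;
# the real perceptron loop appears in train_tron() further down.
def progressive_error(stream):
    """Error rate of predictions made BEFORE each label is revealed (progressive loss)."""
    errors, seen, positives = 0, 0, 0
    for label in stream:
        guess = 1 if positives * 2 > seen else 0  # predict from what has been seen so far
        if guess != label:
            errors += 1
        seen += 1                                  # only now "learn" from the revealed label
        positives += label
    return errors / float(seen)

print(progressive_error([1, 0, 1, 1, 0, 1, 1, 1]))
# < 0.5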
# In the code below, opts["D"] is set to the following fixed value:
# 2 ** 25  # = 33554432
# @
# https://github.com/MLWave/online-learning-perceptron
import re
import random
from math import exp, log
from datetime import datetime
from operator import itemgetter

def clean(s):
    """
    Returns a cleaned, lowercased string
    """
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

def get_data_tsv(loc_dataset, opts):
    """
    Runs through the data in an online manner.
    Parses a tsv file for this competition and yields label, identifier and features.
    output:
        label: int, the label / target (set to 1 for the test set)
        id: string, the sample identifier
        features: list of tuples, in the form [(hashed_feature_index, feature_value)]
    """
    # Read the file line by line
    for e, line in enumerate(open(loc_dataset, "rb")):
        if e > 0:
            r = line.decode('utf-8').strip().split("\t")
            id = r[0]
            if opts["clean"]:
                try:
                    r[2] = clean(r[2])
                except:
                    r[1] = clean(r[1])
            # opts["D"] = 2 ** 25 = 33554432 = fixed size
            # Vowpal Wabbit-style hashing trick
            if len(r) == 3:  # train set
                features = [(hash(f) % opts["D"], 1) for f in r[2].split()]
                label = int(r[1])
            else:  # test set
                features = [(hash(f) % opts["D"], 1) for f in r[1].split()]
                label = 1
            # With bigrams, each feature[i] is hashed together with its next feature[i+1]
            if opts["2grams"]:
                for i in range(len(features) - 1):
                    features.append(
                        (hash(str(features[i][0]) + str(features[i+1][0])) % opts["D"], 1))
            yield label, id, features

# Vector inner product between features and weights
def dot_product(features, weights):
    """
    Calculate the dot product of features and weights
    input:
        features: a list of tuples [(feature_index, feature_value)]
        weights: the hashing trick weights filter, note: length is max(feature_index)
    output:
        dotp: the dot product
    """
    dotp = 0
    for f in features:
        dotp += weights[f[0]] * f[1]
    return dotp

# Train on the train dataset
def train_tron(loc_dataset, opts):
    # Starting time of training
    start = datetime.now()
    print("\nPass\t\tErrors\t\tAverage\t\tNr. Samples\tSince Start")
    # Initialize the weights
    if opts["random_init"]:
        random.seed(3003)
        # (the original script used [random.random()] * opts["D"], which repeats a single
        #  random value D times; drawing one value per weight is the likely intent)
        weights = [random.random() for _ in range(opts["D"])]
    else:
        weights = [0.] * opts["D"]
    # Run the training passes
    # n_passes: the number of passes over the training data
    for pass_nr in range(opts["n_passes"]):
        error_counter = 0
        for e, (label, id, features) in enumerate(get_data_tsv(loc_dataset, opts)):
            # The perceptron is a supervised classifier: it predicts from the current
            # weights first, then learns from the label. 0.5 is the decision threshold.
            dp = dot_product(features, weights) > 0.5
            # dp: the perceptron's prediction
            # label: the answer from the dataset
            error = label - dp
            # If the prediction is wrong, the perceptron updates its weights
            if error != 0:
                error_counter += 1
                # Updating the weights:
                # weights[feature_index] += learning_rate * error * log(1 + feature_value)
                for index, value in features:
                    weights[index] += opts["learning_rate"] * error * log(1. + value)
        # Reporting
        print("%s\t\t%s\t\t%s\t\t%s\t\t%s" % (
            pass_nr + 1,
            error_counter,
            round(1 - error_counter / float(e + 1), 5),
            e + 1,
            datetime.now() - start))
        # The perceptron halts once it makes no errors on the training data
        # (at which point it has effectively memorized / overfit the train set)
        # or once the errors drop below errors_satisfied
        if error_counter == 0 or error_counter < opts["errors_satisfied"]:
            print("%s errors found during training, halting" % error_counter)
            break
    return weights
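# @
# A tiny worked example of the mistake-driven update used inside train_tron() above.
# The 16-slot weight vector and the three hashed feature indices are made up for
# illustration only; everything else reuses dot_product() and the same update rule.
toy_weights = [0.] * 16
toy_features = [(3, 1), (7, 1), (11, 1)]  # three hashed features with value 1
toy_label = 1
prediction = dot_product(toy_features, toy_weights) > 0.5  # 0.0 > 0.5 -> False, predict 0
error = toy_label - prediction                             # 1 - 0 = 1 -> a mistake
if error != 0:
    for index, value in toy_features:
        # same rule as train_tron(): learning_rate * error * log(1 + feature_value)
        toy_weights[index] += 0.1 * error * log(1. + value)
print([round(w, 3) for w in toy_weights])
# slots 3, 7 and 11 each move up by 0.1 * log(2) ~ 0.069, so the next dot product
# on these features is ~0.208 (still below the 0.5 threshold)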
# This function predicts using the weights learnt above
def test_tron(loc_dataset, weights, opts):
    """
    output:
        preds: list, a list with [id, prediction, dotproduct, 0-1 normalized dotproduct]
    """
    start = datetime.now()
    print("\nTesting online\nErrors\t\tAverage\t\tNr. Samples\tSince Start")
    preds = []
    error_counter = 0
    # The test data has no label, so get_data_tsv() yields label = 1 for every sample
    # Load the tsv data
    for e, (label, id, features) in enumerate(get_data_tsv(loc_dataset, opts)):
        # Calculate the inner product
        dotp = dot_product(features, weights)
        dp = dotp > 0.5
        if dp:  # we predict the positive class
            preds.append([id, 1, dotp])
        else:
            preds.append([id, 0, dotp])
        if label - dp != 0:
            error_counter += 1
    print("%s\t\t%s\t\t%s\t\t%s" % (
        error_counter,
        round(1 - error_counter / float(e + 1), 5),
        e + 1,
        datetime.now() - start))
    # Normalize the dot products into the range [0, 1]
    # TODO: proper probability (bounded sigmoid?), online normalization
    max_dotp = max(preds, key=itemgetter(2))[2]
    min_dotp = min(preds, key=itemgetter(2))[2]
    for p in preds:
        # append the normalized value to each prediction
        p.append((p[2] - min_dotp) / float(max_dotp - min_dotp))
    # Reporting
    print("Done testing in %s" % str(datetime.now() - start))
    return preds
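# @
# The TODO inside test_tron() notes that min-max normalization of the dot products is
# not a proper probability. A common alternative is to squash the raw dot product with
# a bounded sigmoid; the sketch below only illustrates that idea, and the scale factor
# of 0.05 is an arbitrary assumption rather than anything tuned in the original script.
def dotp_to_probability(dotp, scale=0.05):
    """Map a raw perceptron dot product onto (0, 1) with a logistic squash."""
    z = max(min(dotp * scale, 35.), -35.)  # clamp so exp() cannot overflow
    return 1. / (1. + exp(-z))

print([round(dotp_to_probability(d), 3) for d in (77.7, -0.2, -99.6)])
# strongly positive dot products map close to 1, strongly negative ones close to 0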
# Setting options
opts = {}
# Fixed size for the hashing trick
opts["D"] = 2 ** 25
opts["learning_rate"] = 0.1
# Maximum number of passes to run before halting
opts["n_passes"] = 80
# Halt when training errors < errors_satisfied
opts["errors_satisfied"] = 0
# Set random initial weights, else set all weights to 0
opts["random_init"] = False
# Clean the text a little
opts["clean"] = True
# Add 2-grams
opts["2grams"] = True

# training and saving the model into weights
# %time weights = train_tron("data/labeledTrainData.tsv",opts)
# Pass    Errors    Average    Nr. Samples    Since Start
# 1       5648      0.77408    25000          0:00:19.187437
# 2       3161      0.87356    25000          0:00:38.114154
# 3       2218      0.91128    25000          0:00:57.910578
# 4       1643      0.93428    25000          0:01:13.906263
# 5       1254      0.94984    25000          0:01:29.959013
# 6       1038      0.95848    25000          0:01:44.738919
# 7       805       0.9678     25000          0:01:59.575742
# 8       579       0.97684    25000          0:02:14.277759
# 9       513       0.97948    25000          0:02:29.356437
# 10      464       0.98144    25000          0:02:44.361049
# 11      367       0.98532    25000          0:02:59.225589
# 12      363       0.98548    25000          0:03:19.239056
# 13      231       0.99076    25000          0:03:35.903281
# 14      203       0.99188    25000          0:03:53.428992
# 15      160       0.9936     25000          0:04:08.202330
# 16      163       0.99348    25000          0:04:27.039214
# 17      144       0.99424    25000          0:04:43.598832
# 18      168       0.99328    25000          0:04:59.653083
# 19      99        0.99604    25000          0:05:16.492206
# 20      98        0.99608    25000          0:05:31.565497
# 21      127       0.99492    25000          0:05:46.293131
# 22      81        0.99676    25000          0:06:01.460219
# 23      73        0.99708    25000          0:06:17.186924
# 24      92        0.99632    25000          0:06:33.383587
# 25      96        0.99616    25000          0:06:50.064784
# 26      82        0.99672    25000          0:07:06.048313
# 27      41        0.99836    25000          0:07:21.237198
# 28      84        0.99664    25000          0:07:36.076427
# 29      75        0.997      25000          0:07:52.291855
# 30      66        0.99736    25000          0:08:08.568775
# 31      20        0.9992     25000          0:08:25.085804
# 32      27        0.99892    25000          0:08:41.198909
# 33      7         0.99972    25000          0:08:57.871054
# 34      46        0.99816    25000          0:09:13.094634
# 35      23        0.99908    25000          0:09:27.876432
# 36      7         0.99972    25000          0:09:42.681918
# 37      61        0.99756    25000          0:09:57.770814
# 38      7         0.99972    25000          0:10:15.091846
# 39      0         1.0        25000          0:10:30.445496
# 0 errors found during training, halting
# CPU times: user 10min 17s, sys: 3.64 s, total: 10min 21s
# Wall time: 10min 30s

# testing and saving the predictions into preds
# pass the learnt weights from above, plus the options, to predict
%time preds = test_tron("data/testData.tsv",weights,opts)
# Testing online
# Errors    Average    Nr. Samples    Since Start
# 12731     0.49076    25000          0:00:17.228997
# Done testing in 0:00:17.255413
# CPU times: user 16.7 s, sys: 143 ms, total: 16.9 s
# Wall time: 17.3 s

preds[:10]
# [['"12311_10"', 1, 77.70179894076985, 0.6557850948318409],
#  ['"8348_2"', 0, -99.60524984646412, 0.4754283296904748],
#  ['"5828_4"', 0, -0.2079441541680142, 0.5765352887259396],
#  ['"7186_2"', 0, -3.2577917486317594, 0.5734329831488402],
#  ['"12128_7"', 1, 22.66591280431018, 0.5998025805541847],
#  ['"2913_8"', 1, 68.48294143932256, 0.6464076711556087],
#  ['"4396_1"', 0, -36.2515975432851, 0.5398716773602201],
#  ['"395_2"', 0, -1.3169796430639447, 0.5754071776069943],
#  ['"10616_1"', 0, -87.47517418666499, 0.48776704505393825],
#  ['"9074_9"', 0, -23.497689420982187, 0.5528449552280901]]

# writing the kaggle submission file (the normalized dot product is submitted as the sentiment score)
with open("data/submit_perceptron.csv", "wb") as outfile:
    outfile.write('"id","sentiment"\n'.encode('utf-8'))
    for p in sorted(preds):
        outfile.write("{},{}\n".format(p[0], p[3]).encode('utf-8'))

# Kaggle score: 0.95338
# 129/578 (= 0.2231833910034602, i.e. roughly the top 22% of the leaderboard)

import pandas as pd
presult = pd.DataFrame(preds)
presult.head()
# <             0  1          2         3
# < 0  "12311_10"  1  77.701799  0.655785
# < 1    "8348_2"  0 -99.605250  0.475428
# < 2    "5828_4"  0  -0.207944  0.576535
# < 3    "7186_2"  0  -3.257792  0.573433
# < 4   "12128_7"  1  22.665913  0.599803

output_sentiment = presult[1].value_counts()
print(output_sentiment[0] - output_sentiment[1])
# < 462
output_sentiment
# < 0    12731
# < 1    12269
# < Name: 1, dtype: int64