Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import re | |
| import nltk | |
| nltk.download('stopwords') | |
| from nltk.corpus import stopwords | |
| from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords | |
| import spacy | |
| import torch | |
| from transformers import T5ForConditionalGeneration,T5Tokenizer | |
| import random | |
| from operator import itemgetter | |
| #Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen | |
def doc_text_preprocessing(ser):
    """Tokenize and normalize a pandas Series of document strings.

    Pipeline: spaCy lemmatization, then gensim ``preprocess_string``
    filters (stopword removal; numeric/punctuation/tag stripping;
    whitespace collapsing; single-letter removal; lowercasing), then a
    final pass against the NLTK English stopword list.

    Parameters
    ----------
    ser : pandas.Series of str
        Raw document texts.

    Returns
    -------
    list[list[str]]
        One token list per input document.

    Custom text tokenizer from https://github.com/canunj/deconstructing_games
    by N Canu & K Chen.
    """
    # Parser/NER/textcat are not needed for lemmatization; excluding them
    # speeds up both model loading and processing.
    nlp = spacy.load("en_core_web_md", exclude=['parser', 'ner', 'textcat'])
    stop_words = set(stopwords.words('english'))

    # Raw strings so \s and \w are regex escapes, not (deprecated) string
    # escapes; original used a plain string, which warns on Python 3.12+.
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()
    custom_filters = [remove_stopwords, strip_numeric, strip_punctuation,
                      strip_tags, strip_multiple_whitespaces,
                      single_letter_replace, to_lower_func]

    # nlp.pipe streams the whole Series through spaCy in batches, which is
    # faster than calling nlp() once per row via Series.apply.
    lemma_text = [preprocess_string(' '.join(token.lemma_ for token in doc),
                                    custom_filters)
                  for doc in nlp.pipe(ser)]

    # Second stopword pass with the NLTK list (it differs from gensim's).
    return [[word for word in tokens if word not in stop_words]
            for tokens in lemma_text]
class Title_Generator:
    """Generate and rank candidate titles for board-game descriptions.

    Uses a fine-tuned T5 model (diverse beam search) to propose titles,
    filters out titles of existing games, and scores the survivors by
    spaCy cosine similarity against the description text.
    """

    def __init__(self, path, df):
        """
        Parameters
        ----------
        path : str
            Local directory or hub id of the fine-tuned T5 checkpoint.
        df : pandas.DataFrame
            Existing-game table; must expose a ``name`` column used to
            drop candidates that duplicate known titles.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None
        # Full pipeline (vectors included) — needed for Doc.similarity.
        self.nlp = spacy.load("en_core_web_md")

    def candidate_generator(self, description):
        """Run diverse beam search over the description.

        Returns
        -------
        tuple[list[str], str]
            (decoded candidate titles, the original description).
        """
        text = "headline: " + description
        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,       # diverse beam search: 4 groups of 4 beams
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)

        # Strip T5 special tokens from the decoded sequences.
        candidates = [self.tokenizer.decode(result)
                          .replace('<pad> ', '').replace('</s>', '').replace('<pad>', '')
                      for result in beam_outputs]
        return candidates, description

    def candidate_score(self, candidates, ex_check=None):
        """Clean, dedupe, and similarity-score candidate titles.

        Parameters
        ----------
        candidates : tuple[list[str], str]
            Output of :meth:`candidate_generator`.
        ex_check : list[str] | None
            Optional titles to exclude (regex alternatives).

        Returns
        -------
        dict
            ``{'text': masked description, 'titles': [(title, score), ...]}``
            with titles sorted by descending similarity score.
        """
        # Mask occurrences of candidate titles (and excluded titles, if
        # given) in the description so the generator cannot simply echo
        # text it has already produced.
        alternatives = candidates[0] + [cand.upper() for cand in candidates[0]]
        if ex_check is not None:
            pat = re.compile("((?:" + "|".join(map(re.escape, alternatives))
                             + "|" + "|".join(ex_check) + "))")
        else:
            pat = re.compile("((?:" + "|".join(map(re.escape, alternatives)) + "))")
        desc = re.sub(pat, "__", candidates[1])

        # A title leaked into the description: regenerate from the masked
        # text and merge the new candidates with the originals.
        if re.search(re.compile(re.escape("__")), desc):
            hold = candidates[0]
            gen_desc = re.sub(re.compile(re.escape("__")), "", desc)
            candidates = self.candidate_generator(gen_desc)
            merged = candidates[0] + hold
            if ex_check is not None:
                # BUG FIX: the original built this regex from ex_check even
                # when ex_check was None, raising TypeError; only filter
                # when an exclusion list was actually supplied.
                reg = re.compile("(" + "|".join(ex_check) + ")")
                merged = [cand for cand in merged if not reg.search(cand)]
            candidates = (merged, desc)

        # Check for existing games and duplicates.
        # transform function from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
        def transform(L):
            S = set(L)
            return [item.title() for item in L
                    if item.lower() not in S and not S.add(item.lower())]

        # Keep only candidates whose name does not already exist in game_df.
        clean_cand_step = list(set(
            game[0] for game in zip(
                candidates[0],
                [len(self.game_df[self.game_df.name.isin([x])]) for x in candidates[0]])
            if game[1] == 0))
        clean_cand_step = transform(clean_cand_step)

        # Normalize casing of connectives/possessives and strip edition tags.
        # Raw strings so \S and \b are regex escapes, not string escapes.
        clean_cand_step = [
            re.sub(re.compile(r"(?<=[ ])And(?=[ ])"), 'and',
            re.sub(re.compile(r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'), "",
            re.sub(re.compile(r"(?<=[a-z])'S"), "'s",
            re.sub(re.compile(r"(?<=[ ])Of(?=[ ])"), "of", x))))
            for x in clean_cand_step]

        # Collapse redundant "X: X" titles down to "X".
        clean_cand = []
        for cand in clean_cand_step:
            try:
                inter = cand.split(":")
                if inter[0].lower() == inter[1].lower():
                    clean_cand.append(inter[0])
                else:
                    clean_cand.append(cand)
            except IndexError:  # no ":" in the title — keep it unchanged
                clean_cand.append(cand)

        # Text processing.
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
        sim = [self.nlp(title) for title in [" ".join(title) for title in token_cand]]
        doc = self.nlp(" ".join(token_art[0]))

        # Scores cosine similarity between generated titles and body text;
        # if a word is unknown (the generator knows it but spaCy doesn't,
        # so similarity is 0), assign a random probability to populate.
        scores = [x if x != 0 else random.uniform(.3, .7)
                  for x in [tok.similarity(doc) for tok in sim]]
        out_titles = sorted(list(zip(clean_cand, scores)),
                            key=itemgetter(1), reverse=True)

        # Final cleanup of the masked description: spacing after sentence
        # punctuation, boilerplate headers, and degenerate "__" phrasings.
        pat = re.compile(r"(?<=[!.?])(?=[^\s])")
        pat2 = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
        pat3 = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
        pat4 = re.compile("[Tt]he __")
        pat5 = re.compile("__ [Gg]ame")
        pat6 = re.compile("[Tt]he [Gg]ame [Oo]f __")
        desc = re.sub(pat, " ", candidates[1])
        desc = re.sub(pat2, "", desc)
        desc = re.sub(pat3, "", desc)
        desc = re.sub(pat4, "__", desc)
        desc = re.sub(pat5, "__", desc)
        desc = re.sub(pat6, "__", desc)
        return {'text': desc, 'titles': out_titles}