import re
class RegexPatterns:
    """Regular-expression patterns for pulling tokens out of (or stripping them from) text.

    ``PHONE_NUMBERS``, ``URL2`` and ``DATETIME`` are pre-compiled pattern objects;
    every other attribute is a raw pattern string to pass to the ``re`` functions.
    """

    # Twitter @mentions and #hashtags; group 1 is the name without the sigil.
    MENTIONS = r'@(\w+)'
    HASHTAGS = r'#(\w+)'
    # 3 digits, any one character, 3 digits, any one character, 4 digits.
    # The separators are an unescaped "." and therefore match ANY character.
    PHONE_NUMBERS = re.compile(r'\d{3}.\d{3}.\d{4}')
    # HTML character entities such as "&amp;" (group 1 is "amp;").
    HTML = r'&(\w+;)'
    # Remove special characters, numbers, punctuation (everything but letters and '#').
    REMOVE_PAT1 = r"[^a-zA-Z#]"
    # An earlier value, r"[^A-Za-z0-9^,!.\/'+-=]", used to be assigned to
    # REMOVE_PAT2 here and was overwritten on the very next line; only the
    # effective definition is kept.
    REMOVE_PAT2 = r"[^a-zA-Z0-9#@']"
    # Runs of two or more whitespace characters.
    # Raw string: the plain "\s" literal is an invalid escape sequence.
    STRIP_SPACE = r"\s{2,}"
    # URLs: http(s)://... or www....
    URL1 = r'(https?://[^\s<>"]+|www\.[^\s<>"]+)'
    # http(s) URL split into optional "www.", first host label and next ".label".
    URL2 = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
    # Timestamps shaped like "YYYY-MM-DD HH:MM:SS".
    DATETIME = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')
    EMAILS = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
    # Whole words of 1-3 word characters.
    SHORT_WORDS = r'(\b\w{1,3}\b)'
class TextBlobSentiment:
    """Normalise a piece of raw text and keep the cleaned result on ``self.text``.

    The constructor lower-cases the text, expands common English contractions,
    removes HTML entities, URLs, hashtags, @mentions and punctuation, drops
    stop words and stores the remaining words joined by single spaces.

    NOTE(review): a second class with this same name is defined later in this
    file and rebinds the name; confirm which of the two callers expect.
    """

    def __init__(self, text, stopwords=None):
        """Clean ``text`` and store it as ``self.text``.

        :param text: raw input string.
        :param stopwords: optional iterable of words to drop. When omitted the
            NLTK English stop-word corpus is used (requires the corpus to be
            installed).
        """
        text = text.lower()
        # Expand contractions so the pieces survive punctuation removal.
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"couldn't", "could not ", text)
        text = re.sub(r"(\'s+)", " is ", text)
        text = re.sub(r"(\'ve+)", " have ", text)
        text = re.sub(r"(n't+)", " not ", text)
        text = re.sub(r"(i'm+)", "i am ", text)
        text = re.sub(r"(\'re+)", " are ", text)
        text = re.sub(r"(\'d+)", " would ", text)
        text = re.sub(r"(\'ll+)", " will ", text)
        text = re.sub(r" (9 11+) ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        # HTML entities, URLs, hashtags and mentions carry no sentiment here.
        text = re.sub(r'&(\w+;)', '', text)
        text = re.sub(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)', '', text)
        text = re.sub(r"#(\w+)", '', text)
        # The original call omitted the replacement argument (TypeError).
        text = re.sub(r'@(\w+)', '', text)
        # str.translate needs a translation table; passing string.punctuation
        # directly does not delete punctuation.
        text = text.translate(str.maketrans('', '', string.punctuation))
        if stopwords is None:
            import nltk
            stopwords = nltk.corpus.stopwords.words('english')
        stop_set = set(stopwords)
        tokens = re.split(r'\W+', text)
        # Join back into a string: the original ran re.sub on the token list,
        # which raises TypeError. Empty tokens from leading/trailing
        # separators are dropped so words are separated by exactly one space.
        self.text = ' '.join(
            word for word in tokens if word and word not in stop_set
        )
@contextmanager
def timer(name="duration"):
    """Context manager that prints how long the enclosed block took.

    :param name: label printed in front of the elapsed time.

    The elapsed time is printed even when the block raises; the exception then
    continues to propagate.
    """
    # perf_counter is monotonic, so clock adjustments cannot skew the result.
    start = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start
        print("{0}: {1} second(s)".format(name, duration))
# Module-level engine for the bare 'postgresql://' URL; connections are produced
# by the `connect` callable passed as `creator`.
# NOTE(review): `create_engine`, `sessionmaker` and `connect` are not imported or
# defined in the visible part of this file - confirm where they come from.
engine = create_engine('postgresql://', creator=connect)
# Session factory bound to the engine above; called by _session_scope().
Session = sessionmaker(bind=engine)
@contextmanager
def _session_scope():
    """Yield a new ``Session`` and finish its transaction around the ``with`` body.

    The session is committed when the body completes, rolled back (and the
    exception re-raised) when the body or the commit raises an ``Exception``,
    and closed in every case.
    """
    db_session = Session()
    try:
        yield db_session
        # Commit stays inside the try so a failing commit is rolled back too.
        db_session.commit()
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
# Create a SQL table from a CSV file
def create_sql_table(inputfile, dbname, cursor=None, connection=None):
    """Create table ``dbname`` from a CSV file and load every row into it.

    The first CSV row supplies the column names (declared without types); all
    remaining rows are inserted with ``?`` placeholders.

    :param inputfile: path of a UTF-8, comma-delimited CSV file with a header row.
    :param dbname: name of the table to create / fill. It is interpolated into
        the SQL text verbatim, so it must come from a trusted source.
    :param cursor: DB-API cursor to execute on. Defaults to
        ``connection.cursor()`` when only ``connection`` is given, otherwise
        to the module-level ``c``.
    :param connection: DB-API connection to commit on. Defaults to the
        module-level ``conn``.
    :raises ValueError: if the file has no header row.
    """
    if cursor is None:
        cursor = c if connection is None else connection.cursor()
    if connection is None:
        connection = conn

    # newline='' is what the csv module expects; the with-block closes the
    # file, which the original never did.
    with open(inputfile, 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        headers = next(reader, None)
        if not headers:
            raise ValueError("{} has no header row".format(inputfile))

        # Build the table from the CSV headers. Header text comes from the
        # file, so quote it as an identifier and escape embedded quotes.
        columns = ",".join(
            '"' + header.replace('"', '""') + '"' for header in headers
        )
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS {} ( {});".format(dbname, columns)
        )

        # Placeholders are preferred to avoid SQL injection through row values.
        placeholders = ",".join("?" * len(headers))
        cursor.executemany(
            "INSERT INTO {} VALUES ( {})".format(dbname, placeholders), reader
        )
    connection.commit()
class Decorators:
    """Namespace for small function decorators.

    The functions take no ``self``; use them through the class, e.g.
    ``@Decorators.timefunc`` or ``@Decorators.ntimes(3)``.
    """

    def timefunc(func):
        """Decorate ``func`` so every call prints its wall-clock run time.

        :param func: callable to wrap.
        :returns: wrapper with the same signature and return value as ``func``.
        """
        from functools import wraps
        from time import time

        @wraps(func)  # keep the wrapped function's name and docstring
        def f(*args, **kwargs):
            start = time()
            rv = func(*args, **kwargs)
            finish = time()
            print('Run time is.', finish - start)
            return rv
        return f

    # HIGHER ORDER DECORATOR
    def ntimes(n):
        """Return a decorator that runs the wrapped function ``n`` times per call.

        :param n: number of repetitions.
        :returns: decorator whose wrapper returns the result of the last run,
            or ``None`` when ``n`` is zero or negative (the original raised
            ``UnboundLocalError`` in that case).
        """
        from functools import wraps

        def inner(f):
            @wraps(f)
            def wrapper(*args, **kwargs):
                rv = None
                for _ in range(n):
                    print('running {.__name__}'.format(f))
                    rv = f(*args, **kwargs)
                return rv
            return wrapper
        return inner
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import nltk
from textblob import TextBlob
class Vader:
    """VADER sentiment scores for one piece of text."""

    def __init__(self, text):
        """Store ``text`` and create the analyser used to score it.

        :param text: the string to analyse.
        """
        self.text = text
        # The analyser's constructor takes lexicon file paths, not the text to
        # score; the text is passed to polarity_scores() instead.
        self.analyser = SentimentIntensityAnalyzer()

    def _scores(self):
        """Return the ``polarity_scores`` dict for ``self.text``."""
        return self.analyser.polarity_scores(self.text)

    def positive_vader(self):
        """Return the ``'pos'`` score."""
        return self._scores()['pos']

    def negative_vader(self):
        """Return the ``'neg'`` score."""
        return self._scores()['neg']

    def neutral_vader(self):
        """Return the ``'neu'`` score."""
        return self._scores()['neu']

    def compound_vader(self):
        """Return the ``'compound'`` score."""
        return self._scores()['compound']
class TextBlobSentiment:
    """TextBlob polarity of a piece of text, as a sign or as the raw value."""

    def __init__(self, text):
        """Keep ``text`` and the ``TextBlob`` built from it."""
        self.text = text
        self.analysis = TextBlob(self.text)

    def textblob_sentiment(self):
        """Return 1 for positive polarity, 0 for exactly zero, otherwise -1."""
        polarity = self.analysis.sentiment.polarity
        if polarity > 0:
            return 1
        if polarity == 0:
            return 0
        return -1

    def textblob_sentiment_raw(self):
        """Return the polarity value converted to ``str``."""
        return str(self.analysis.sentiment.polarity)
# Create an HTML table in a Dash app from a pandas DataFrame.
def generate_table(dataframe, max_rows=10):
    """Build an ``html.Table`` from the first rows of ``dataframe``.

    :param dataframe: object exposing ``columns``, ``iloc`` and ``len()``.
    :param max_rows: upper bound on the number of data rows rendered.
    :returns: ``html.Table`` holding one header row followed by the data rows.
    """
    columns = dataframe.columns
    header_row = html.Tr([html.Th(col) for col in columns])
    row_count = min(len(dataframe), max_rows)
    body_rows = [
        html.Tr([html.Td(dataframe.iloc[i][col]) for col in columns])
        for i in range(row_count)
    ]
    return html.Table([header_row] + body_rows)
import re
import nltk
class TwitterHelpers:
    """Tokenising, length and word-count helpers for tweet text."""

    # Characters deleted before counting words. The original list held ", "
    # (comma + space), which glued neighbouring words together
    # ("hello, world" -> "helloworld"); only the comma itself is removed now.
    _SKIP_TABLE = str.maketrans('', '', '.,:;\'"')

    def preprocessed_tweets(self, tweets):
        """Strip punctuation from ``tweets`` and return its non-stop-word tokens.

        :param tweets: text to tokenise.
        :returns: list of ``str`` tokens that are not NLTK English stop words.
        """
        stopwords = set(nltk.corpus.stopwords.words('english'))
        text = "".join([ch for ch in tweets if ch not in string.punctuation])
        tokens = re.split(r'\W+', text)
        return [str(word) for word in tokens if word not in stopwords]

    def remove_stopwords(self, text):
        """Split ``text`` on non-word characters and drop English stop words."""
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokens = re.split(r'\W+', text)
        return [str(word) for word in tokens if word not in stopwords]

    def tweet_length(self, text):
        """Return the number of non-space characters in ``text`` as a string."""
        return str(len(text) - text.count(' '))

    def count_punc(self, text):
        """Return the percentage of non-space characters that are punctuation.

        :returns: percentage rounded to one decimal, as a string; ``"0.0"``
            when ``text`` has no non-space characters (the original raised
            ``ZeroDivisionError``).
        """
        non_space = len(text) - text.count(" ")
        if non_space == 0:
            return "0.0"
        count = sum(1 for char in text if char in string.punctuation)
        # Round after scaling so artefacts such as 33.300000000000004
        # never reach the output string.
        return str(round(count / non_space * 100, 1))

    def get_unique_mentions_count(self, text):
        """Return ``str()`` of a ``Counter`` of the space-separated words in ``text``."""
        return str(Counter(text.split(' ')))

    def count_words_fast(self, text):
        """Return a ``Counter`` of lower-cased words with basic punctuation removed."""
        text = text.lower().translate(self._SKIP_TABLE)
        return Counter(text.split(" "))

    def count_words(self, text):
        """Return a plain ``dict`` of word counts with basic punctuation removed.

        Unlike :meth:`count_words_fast` the text is not lower-cased.
        """
        text = text.translate(self._SKIP_TABLE)
        return dict(Counter(text.split(" ")))
class TextUtils(object):
    """General-purpose text normalisation and counting helpers."""

    def normalize_text(self, text):
        """Return ``text`` NFKD-normalised, stripped and lower-cased."""
        return normalize('NFKD', text).strip().lower()

    def tweet_length(self, text):
        """Return the number of non-space characters in ``text`` as a string."""
        return str(len(text) - text.count(' '))

    def count_punc(self, text):
        """Return the percentage of non-space characters that are punctuation.

        :returns: percentage rounded to one decimal, as a string; ``"0.0"``
            when ``text`` has no non-space characters (the original raised
            ``ZeroDivisionError``).
        """
        non_space = len(text) - text.count(" ")
        if non_space == 0:
            return "0.0"
        count = sum(1 for char in text if char in string.punctuation)
        # Round after scaling so float artefacts never reach the output.
        return str(round(count / non_space * 100, 1))

    def df_count_words(self, series):
        """Print and return the ten most common ``', '``-separated words.

        :param series: iterable of strings, each a ``', '``-joined word list.
            The original ignored this argument and read ``df['hashtags']``
            from a global instead.
        :returns: list of ``(word, count)`` pairs, most common first.
        """
        all_words = [word for line in series for word in line.split(', ')]
        counter = Counter(all_words).most_common(10)
        print(counter)
        return counter

    def remove_all_punctuation(self, text):  # Obsolete
        r"""Remove every ``string.punctuation`` character from ``text``.

        Removes the following special characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

        :param text: the string to clean.
        :returns: ``text`` without punctuation.
        """
        # translate() needs a table built by str.maketrans; handing it
        # string.punctuation directly does not delete anything.
        return str(text.translate(str.maketrans('', '', string.punctuation)))
class JobStops:
    # Flat list of tokens to drop during text clean-up. It mixes English and
    # Spanish stop words with tokenizer leftovers ("'s", "n't", "``"),
    # Twitter noise ("rt", "retweet", "amp", "https"), digits and the single
    # letters a-z.
    # Several entries occur more than once (e.g. 'is', 'or', 'de'); that is
    # harmless for membership tests, but lookups on a list are linear.
    # NOTE(review): strip_stopwords elsewhere in this file reads
    # Stops.stop_words imported from .stopwords_py, not this class - confirm
    # whether JobStops is still used.
    stop_words = ['i', 'im', 'or', 'is', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
        'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
        'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
        'itself', 'they', 'them', 'their', 'theirs', 'themselves',
        'what', 'which', 'who', 'whom', 'this', 'that', 'these',
        'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
        'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
        'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
        'against', 'between', 'into', 'through', 'during', 'before', 'after',
        'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
        'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
        'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
        'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
        'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
        'will', 'just', 'don','should', 'now','https',"'s",'...', "whats'",
        "rt","whats","n't","de","'m","un","en","``","dedic","twittermoments",
        "amp","e","y","o","ce","retweet","sur","na","el","1","2","3","4",
        "5","6","7","8","9","0","ca","nao","se","com","los","u","des","-",
        "--","'","''","la","como","con","segundo",'de', 'la', 'que', 'el',
        'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para',
        'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus',
        'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando',
        'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde',
        'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les',
        'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto',
        'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras',
        'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada',
        'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo',
        'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas',
        'nosotras', 'vosostros', 'vosostras', 'os', 'mío', 'mía', 'míos',
        'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos',
        'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro',
        'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estás',
        'está', 'estamos', 'estáis', 'están', 'esté', 'estés', 'estemos',
        'estéis', 'estén', 'estaré', 'estarás', 'estará', 'estaremos',
        'estaréis', 'estarán', 'estaría', 'estarías', 'estaríamos',
        'estaríais', 'estarían', 'estaba', 'estabas', 'estábamos',
        'estabais', 'estaban', 'estuve', 'estuviste', 'estuvo', 'estuvimos',
        'estuvisteis', 'estuvieron', 'estuviera', 'estuvieras', 'estuviéramos',
        'estuvierais', 'estuvieran', 'estuviese', 'estuvieses', 'estuviésemos',
        'estuvieseis', 'estuviesen', 'estando', 'estado', 'estada', 'estados',
        'estadas', 'estad', 'he', 'has', 'ha', 'hemos', 'habéis', 'han', 'haya',
        'hayas', 'hayamos', 'hayáis', 'hayan', 'habré', 'habrás', 'habrá', 'habremos',
        'habréis', 'habrán', 'habría', 'habrías', 'habríamos', 'habríais', 'habrían',
        'había', 'habías', 'habíamos', 'habíais', 'habían', 'hube', 'hubiste', 'hubo',
        'hubimos', 'hubisteis', 'hubieron', 'hubiera', 'hubieras', 'hubiéramos',
        'hubierais', 'hubieran', 'hubiese', 'hubieses', 'hubiésemos', 'hubieseis',
        'hubiesen', 'habiendo', 'habido', 'habida', 'habidos', 'habidas', 'soy',
        'eres', 'es', 'somos', 'sois', 'son', 'sea', 'seas', 'seamos', 'seáis',
        'sean', 'seré', 'serás', 'será', 'seremos', 'seréis', 'serán', 'sería',
        'serías', 'seríamos', 'seríais', 'serían', 'era', 'eras', 'éramos', 'erais',
        'eran', 'fui', 'fuiste', 'fue', 'fuimos', 'fuisteis', 'fueron', 'fuera',
        'fueras', 'fuéramos', 'fuerais', 'fueran', 'fuese', 'fueses', 'fuésemos',
        'fueseis', 'fuesen', 'sintiendo', 'sentido', 'sentida', 'sentidos',
        'sentidas', 'siente', 'sentid', 'tengo', 'tienes', 'tiene', 'tenemos',
        'tenéis', 'tienen', 'tenga', 'tengas', 'tengamos', 'tengáis', 'tengan',
        'tendré', 'tendrás', 'tendrá', 'tendremos', 'tendréis', 'tendrán',
        'tendría', 'tendrías', 'tendríamos', 'tendríais', 'tendrían', 'tenía',
        'tenías', 'teníamos', 'teníais', 'tenían', 'tuve', 'tuviste', 'tuvo',
        'tuvimos', 'tuvisteis', 'tuvieron', 'tuviera', 'tuvieras', 'tuviéramos',
        'tuvierais', 'tuvieran', 'tuviese', 'tuvieses', 'tuviésemos', 'tuvieseis',
        'tuviesen', 'teniendo', 'tenido', 'tenida', 'tenidos', 'tenidas', 'tened',
        "ve","dia","algun","ningun","pregunta","segunda","bugun","mas","da",
        "alguna","si","bur","bu","icin","bir","um","know","mais","pra","time","q","em",
        "re","11","isnt","wan","ver","like","'re","m","'ve","bec","n","twt","kca","c","a",
        "b","d","e","f","g","h","i","j","k","l","m","n","o",
        "p","q","r","s","t","u","v","w","x","y","z"]
from unicodedata import normalize
from datetime import datetime
import string
from textblob import TextBlob
import re
import os
import emojis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from .stopwords_py import Stops
#note: depending on how you installed (e.g., using source code download versus pip install), you may need to import like this:
#from vaderSentiment import SentimentIntensityAnalyzer
# import spacy
# nlp = spacy.load('en_core_web_sm')
class OtherRegexMeta(object):
    # Alternative patterns; none of them is referenced by the classes visible
    # in this file.

    # Shortest possible "<...>" span, i.e. one HTML/XML-style tag.
    tag2 = r'<.*?>'
    # A single whitespace character.
    # NOTE(review): possibly a mangled "&nbsp;" or a non-breaking space - confirm.
    tag3 = r' '
    # http(s) URL: optional "www.", first host label and the following ".label".
    URL1 = r'(https?://(www\.)?(\w+)(\.\w+))'
    # General URL matcher (http/https/ftp/file schemes or a www./ftp. prefix).
    # The character classes list upper-case letters only, so lower-case URLs
    # match only when the pattern is used with re.IGNORECASE.
    URL3 = r'(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])'
    # Simple e-mail shape: word@word.word
    EMAIL2 = r'(\w+@\w+\.{1}\w+)'
class MetaRegex(object):
    """Pattern and emoticon constants shared by the text-processing classes.

    All patterns are raw strings except ``emoji_pattern``, which is compiled.
    """

    # --- Twitter mentions / hashtags and HTML entities ---------------------
    HTML = r'&(\w+;)'
    MENTIONS = r'@(\w+)'
    HASHTAGS = r'#(\w+)'

    # --- Phone numbers: 3-3-4 or 3-4 digits; "." accepts any separator -----
    PHONE_NUMBERS = r'(\d{3}.\d{3}.\d{4}|\d{3}.\d{4})'

    # --- HTML stripping -----------------------------------------------------
    HTML_4CHAN_1 = r'&#([0-9]+;)'            # numeric entities
    HTML_4CHAN_MAIN = r'&(\w+;|[#0-9]+;)'    # named or numeric entities
    ALL_HTML_TAGS = r'(</?.*?>)'             # every tag enclosed in <>

    # --- Special characters, numbers, punctuation ---------------------------
    REMOVE_PAT1 = r"[^a-zA-Z#]"
    # Only the effective value is kept; the source assigned
    # r"[^A-Za-z0-9^,!.\/'+-=]" first and overwrote it immediately.
    REMOVE_PAT2 = r"[^a-zA-Z0-9#@']"

    # --- Runs of two or more whitespace characters ---------------------------
    STRIP_SPACE = r"\s{2,}"

    # --- URLs (http/https only) ----------------------------------------------
    URL1 = r'(https?://[A-Za-z0-9./]*)'
    # Alternatives that were tried:
    # URL1 = r'(https?://[^\s<>"]+|www\.[^\s<>"]+)'
    # URL1 = r'(https?://([^\s<>"]+)|www\.([^\s<>"]+))'
    # URL1 = r'(https?://[^\s<>"]+|www\.[^\s<>"]+.\w+/[\w\d]+/\w\d)'

    # --- Date-time "YYYY-MM-DD HH:MM:SS" -------------------------------------
    DATE_TIME = r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})'

    # --- IP address: four dot-separated groups of 1-4 digits -----------------
    IP = r'\b(\d{1,4}[.]\d{1,4}[.]\d{1,4}[.]\d{1,4})\b'

    # --- E-mail addresses -----------------------------------------------------
    EMAILS = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'

    # --- Words of 1-3 characters, 9-digit runs, the ellipsis character --------
    SHORT_WORDS = r'(\b\w{1,3}\b)'
    NINE_NUMS_4CHAN = r'(\d{9})'
    LINKS = r'…'

    # --- Happy emoticons -------------------------------------------------------
    emoticons_happy = {
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
        ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
        '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
        'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
        '<3',
    }

    # --- Sad emoticons -----------------------------------------------------------
    emoticons_sad = {
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
        ':c', ':{', '>:\\', ';(',
    }

    # Every known emoticon, happy or sad.
    emoticons = emoticons_happy | emoticons_sad

    # --- Emoji ranges ---------------------------------------------------------
    # NOTE(review): the last range runs from U+24C2 to U+1F251 and therefore
    # also covers most CJK and many symbol blocks - confirm that this is wanted.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
class MetaFuncs(object):
    """Stateless text helpers, intended to be mixed into the text classes.

    Every helper is a ``staticmethod`` so it behaves the same whether it is
    reached through a class (``cls.helper(...)``) or through an instance.
    """

    @staticmethod
    def substitute_text_pattern(pat, replacement, text):
        """Return ``text`` with every match of ``pat`` replaced by ``replacement``."""
        return str(re.sub(pat, replacement, text))

    @staticmethod
    def count_text_length(text):
        """Return the number of non-space characters in ``text`` as a string."""
        return str(len(text) - text.count(' '))

    @staticmethod
    def count_text_punctuation(text):
        """Return the percentage of non-space characters that are punctuation.

        :returns: percentage rounded to one decimal, as a string; ``"0.0"``
            when ``text`` has no non-space characters (the original raised
            ``ZeroDivisionError``).
        """
        non_space = len(text) - text.count(" ")
        if non_space == 0:
            return "0.0"
        count = sum(1 for char in text if char in string.punctuation)
        # Round after scaling so float artefacts never reach the output.
        return str(round(count / non_space * 100, 1))

    @staticmethod
    def findall_text_pattern(pat, text):
        """Return ``str()`` of the ``re.findall(pat, text)`` result list."""
        return str(re.findall(pat, text))

    @staticmethod
    def count_emojis(text: str):
        """Return the number of emojis in ``text`` (``emojis.count``)."""
        return emojis.count(text)

    @staticmethod
    def count_unique_emojis(text: str, unique=True):
        """Return the emoji count of ``text``, counting each emoji once by default.

        https://emojis.readthedocs.io/en/latest/api.html#sample-code

        :param unique: forwarded to ``emojis.count``; the original accepted
            the flag but never passed it on.
        """
        return emojis.count(text, unique=unique)

    @staticmethod
    def decode_emojis(text: str):
        """Return ``emojis.decode(text)``.

        https://emojis.readthedocs.io/en/latest/api.html#sample-code

        The original computed the value but returned ``None``.
        """
        return emojis.decode(text)

    @staticmethod
    def strip_stopwords(text: str):
        """Return ``str()`` of the space-separated words not in ``Stops.stop_words``."""
        words = set(Stops.stop_words)
        filtered_words = [w for w in text.split(' ') if w not in words]
        return str(filtered_words)

    @staticmethod
    def clean_tweet(tweet):
        """Clean one tweet and return its remaining tokens joined by spaces.

        This is the first of two functions the source named ``clean_tweets``;
        the second definition shadowed it, so it was unreachable. It is kept
        under a singular name, while ``clean_tweets`` still resolves to the
        array version as before.

        NOTE(review): the emoji pattern and emoticon set are read from
        ``MetaRegex``; the original referred to them by bare names that are
        not in scope inside this class - confirm the intended source.
        """
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize

        stop_words = set(stopwords.words('english'))
        # After tweepy preprocessing the colon left over from removed mentions remains.
        tweet = re.sub(r':', '', tweet)
        tweet = re.sub(r'…', '', tweet)
        # Replace consecutive non-ASCII characters with a space.
        tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
        # Remove emojis from the tweet.
        tweet = MetaRegex.emoji_pattern.sub(r'', tweet)
        # Tokenise AFTER cleaning; the original tokenised the raw tweet, so
        # none of the substitutions above affected the result.
        word_tokens = word_tokenize(tweet)
        # Check tokens against stop words, emoticons and punctuation.
        filtered_tweet = [
            w for w in word_tokens
            if w not in stop_words
            and w not in MetaRegex.emoticons
            and w not in string.punctuation
        ]
        return ' '.join(filtered_tweet)

    @staticmethod
    def remove_pattern(input_txt, pattern):
        """Return ``input_txt`` with every match of ``pattern`` removed.

        The original fed each matched string back into ``re.sub`` as a new
        pattern, so matches containing regex metacharacters (``?``, ``+``,
        ``(`` ...) were not removed or raised ``re.error``.
        """
        return re.sub(pattern, '', input_txt)

    @staticmethod
    def clean_tweets(lst):
        """Clean every tweet in ``lst`` and return the results as a NumPy array.

        Removes retweet prefixes (``RT @xxx:``), handles (``@xxx``) and http(s)
        links, then replaces every character except letters and ``#`` with a
        space.

        :param lst: array-like of strings (any shape, may be empty).
        :returns: string ``ndarray`` with the shape of ``lst``.
        """
        import numpy as np

        def scrub(tweet):
            # Remove twitter return handles (RT @xxx:).
            tweet = MetaFuncs.remove_pattern(tweet, r"RT @[\w]*:")
            # Remove twitter handles (@xxx).
            tweet = MetaFuncs.remove_pattern(tweet, r"@[\w]*")
            # Remove URL links (httpxxx).
            tweet = MetaFuncs.remove_pattern(tweet, r"https?://[A-Za-z0-9./]*")
            # Remove special characters, numbers, punctuation (except '#').
            # numpy's char replace is a literal substring replace, so the
            # original call never applied this character class.
            return re.sub(r"[^a-zA-Z#]", " ", tweet)

        tweets = np.asarray(lst)
        cleaned = [scrub(str(tweet)) for tweet in tweets.ravel()]
        return np.array(cleaned, dtype=str).reshape(tweets.shape)

    @staticmethod
    def textblob_sentiment_raw(tweets):
        """Return TextBlob's polarity for ``tweets`` as a string."""
        return str(TextBlob(tweets).sentiment.polarity)

    @staticmethod
    def vader_sentiment_raw(text):
        """Return VADER's compound score for ``text`` as a string."""
        analyzer = SentimentIntensityAnalyzer()
        return str(analyzer.polarity_scores(text)['compound'])

    @staticmethod
    def _label_from_compound(lb):
        """Map a compound score to 1 (>= 0.05), 0 (strictly inside +/-0.05) or -1."""
        if lb >= 0.05:
            return 1
        if -0.05 < lb < 0.05:
            return 0
        return -1

    @staticmethod
    def vader_sentiment_analyzer_scores(text):
        """Return 1, 0 or -1 from the VADER compound score of ``text``."""
        # The original used a global ``analyser`` that this file never defines;
        # build the analyser the same way vader_sentiment_raw does.
        analyzer = SentimentIntensityAnalyzer()
        score = analyzer.polarity_scores(text)
        return MetaFuncs._label_from_compound(score['compound'])

    @staticmethod
    def google_translate_sentiment_analyzer_scores(text, engl=True):
        """Return 1, 0 or -1 for ``text``, translating it first when ``engl`` is false.

        NOTE(review): ``translator`` is not defined in the visible part of this
        file; the non-English path needs a module-level object exposing
        ``translate(text).text``.
        """
        trans = text if engl else translator.translate(text).text
        analyzer = SentimentIntensityAnalyzer()
        score = analyzer.polarity_scores(trans)
        return MetaFuncs._label_from_compound(score['compound'])
class ForChanText(MetaRegex, MetaFuncs):
    """Extraction and sentiment helpers for 4chan post text.

    Patterns come from ``MetaRegex`` and the generic helpers from
    ``MetaFuncs``; everything except the constructor is a classmethod.
    """

    def __init__(self, data):
        self.data = data

    def __repr__(self):
        # One-element tuple so that tuple-valued data formats instead of raising.
        return '[ForChanText: %s]' % (self.data,)

    __str__ = __repr__

    @classmethod
    def extract_url(cls, data):
        """Return ``str()`` of the ``URL1`` matches found in ``data``."""
        return cls.findall_text_pattern(cls.URL1, data)

    @classmethod
    def extract_phone_numbers(cls, data):
        """Return ``str()`` of the ``PHONE_NUMBERS`` matches found in ``data``."""
        return cls.findall_text_pattern(cls.PHONE_NUMBERS, data)

    @classmethod
    def extract_ip_addrs(cls, data):
        """Return ``str()`` of the ``IP`` matches found in ``data``."""
        return cls.findall_text_pattern(cls.IP, data)

    @classmethod
    def extract_email_addrs(cls, data):
        """Return ``str()`` of the ``EMAILS`` matches found in ``data``."""
        return cls.findall_text_pattern(cls.EMAILS, data)

    @classmethod
    def extract_text_length_count(cls, data):
        """Return the non-space character count of ``data`` as a string."""
        return cls.count_text_length(data)

    @classmethod
    def extract_text_punctuation_count(cls, data):
        """Return the punctuation percentage of ``data`` as a string."""
        return cls.count_text_punctuation(data)

    @classmethod
    def extract_text_emoji_count(cls, data):
        """Return the emoji count of ``data``."""
        return cls.count_emojis(data)

    @classmethod
    def _strip_markup(cls, data):
        """Replace tags, HTML entities and 9-digit runs in ``data`` with spaces."""
        data = cls.substitute_text_pattern(cls.ALL_HTML_TAGS, ' ', str(data))
        data = cls.substitute_text_pattern(cls.HTML_4CHAN_MAIN, ' ', data)
        return cls.substitute_text_pattern(cls.NINE_NUMS_4CHAN, ' ', data)

    @classmethod
    def _collapse_spaces(cls, data):
        """Squeeze whitespace runs to ONE space and trim the ends.

        The original replaced the runs with ``''``, which glued neighbouring
        words together ("good  movie" -> "goodmovie").
        """
        return cls.substitute_text_pattern(cls.STRIP_SPACE, ' ', data).strip()

    @classmethod
    def _clean_for_sentiment(cls, data):
        """Return ``data`` without markup and URLs, space-normalised and lower-cased."""
        data = cls._strip_markup(data)
        data = cls.substitute_text_pattern(cls.URL1, ' ', data)
        return cls._collapse_spaces(data).lower()

    @classmethod
    def strip_html(cls, data):
        """Strip markup from ``data`` and return its non-stop-words.

        :returns: the value of ``strip_stopwords`` (``str()`` of a word list).
            Whitespace is normalised before the words are split, so the list
            no longer contains empty strings from doubled spaces.
        """
        data = cls._collapse_spaces(cls._strip_markup(data))
        return cls.strip_stopwords(data)

    @classmethod
    def extract_textblob_sentiment(cls, data):
        """Return the TextBlob polarity of the cleaned ``data`` as a string."""
        return cls.textblob_sentiment_raw(cls._clean_for_sentiment(data))

    @classmethod
    def extract_vader_sentiment(cls, data):
        """Return the VADER compound score of the cleaned ``data`` as a string."""
        return cls.vader_sentiment_raw(cls._clean_for_sentiment(data))
class TwitterText(MetaRegex, MetaFuncs):
    """Extraction and sentiment helpers for tweet text.

    Patterns come from ``MetaRegex`` and the generic helpers from
    ``MetaFuncs``; everything except the constructor is a classmethod.
    """

    def __init__(self, twitter_stream: str):
        self.twitter_stream = twitter_stream

    def __repr__(self):
        return f'TPipe: {self.twitter_stream!r}'

    @classmethod
    def extract_mentions(cls, data):
        """Return ``str()`` of the @mention names found in ``data``."""
        return cls.findall_text_pattern(cls.MENTIONS, data)

    @classmethod
    def extract_hashtags(cls, data):
        """Return ``str()`` of the #hashtag names found in ``data``."""
        return cls.findall_text_pattern(cls.HASHTAGS, data)

    @classmethod
    def _clean_for_sentiment(cls, data):
        """Return ``data`` stripped of markup and noise, ready for scoring.

        Tags, HTML entities, mentions, URLs, IP addresses, e-mail addresses,
        the ellipsis character and emoji are each replaced by a space;
        whitespace runs are then squeezed to ONE space (the original replaced
        them with ``''``, gluing neighbouring words together) and the text is
        trimmed and lower-cased.
        """
        noise_patterns = (
            cls.HTML,
            cls.MENTIONS,
            cls.URL1,
            cls.IP,
            cls.EMAILS,
            # cls.SHORT_WORDS,  # IDK about this one
            cls.LINKS,
            cls.emoji_pattern,
        )
        data = cls.substitute_text_pattern(cls.ALL_HTML_TAGS, ' ', str(data))
        for pattern in noise_patterns:
            data = cls.substitute_text_pattern(pattern, ' ', data)
        data = cls.substitute_text_pattern(cls.STRIP_SPACE, ' ', data)
        return data.strip().lower()

    @classmethod
    def extract_textblob_sentiment(cls, data):
        """Return the TextBlob polarity of the cleaned ``data`` as a string."""
        return cls.textblob_sentiment_raw(cls._clean_for_sentiment(data))

    @classmethod
    def extract_vader_sentiment(cls, data):
        """Return the VADER compound score of the cleaned ``data`` as a string."""
        return cls.vader_sentiment_raw(cls._clean_for_sentiment(data))
if __name__ == "__main__":
    # Manual smoke test for ForChanText.extract_url.
    # Sample block of pager-style message lines. NOTE: this value is overwritten
    # by the next assignment before anything reads it.
    df = '''
310674 2001-09-11 16:12:05 Skytel [005042977] A ALPHA Phillip.Blakeman@usarec.army.mil|hey| Where are my DONUTS? Mimi
26510 2001-09-11 08:14:59 Skytel [005206260] B ALPHA IngallsBW@hqmc.usmc.mil|Warning Warning!! DCID EWG MTG This Morning at 0900|I will go unless otherwi se advised. Bryan (69
28579 2001-09-11 08:24:44 Skytel [005206261] B ALPHA IngallsBW@hqmc.usmc.mil|FW: Warning Warning!! DCID EWG MTG This Morning at 0900|Ray,be at the CMO Of fices for the subject meeting. Bry
87201 2001-09-11 10:21:42 Skytel [004690665] C ALPHA jfraller@usss.treas.gov|(no subject)|Car bomb 15th and F, NW. HIjacked plane enroute DC.
107207 2001-09-11 10:49:59 Skytel [005201647] D ALPHA jtillman@usss.treas.gov|Bob.....following have been accounted for |BROWN, CASSITY, DADE, GROOVER, KE NDRICK, KLENNER, LEWIS, WOLFEN, BOWSER, ALLCMCA PERSONNEL --------------554DC0DF3A8507539115AA1F Content-Type: text/x-vcard;
107736 2001-09-11 10:50:44 Skytel [005055742] D ALPHA wenloe@usss.treas.gov|(no subject)|ALL SFO AGENTS AND SUPERVISORS: CALL INTO THE DUTY DESK IMMEDIATE LY BY TELEPHONE OR RADIO.
141555 2001-09-11 11:33:59 Skytel [005081201] A ALPHA dholland@associates.usss.treas.gov|Urgent!|ALL NEW YORK SECRET SERVICES PERSONNEL -- DO NOT GO INTO NEW YORK CITY! GO TO THE NEAREST RO! ANY QUESTIONS CALL HEADQUATERS AT406-5988. --------------E9F8E9579C859A005E8589A3 C
1275166 2001-09-11 14:58:57 Skytel [004690665] C ALPHA jfraller@usss.treas.gov|FYI|USSS K-9 alerted on cars at 10th &and 18th & Penn
'''
    # Short HTML / URL sample that is actually passed to the extractor below.
    df = 'www.textstuff.com ......... <p class="dasfdawsfa"> fadfsdfsgdshsfdghsdf </p>]'
    # extract_url searches with MetaRegex.URL1, which requires an http(s)://
    # prefix, so the bare "www." address above is not expected to match.
    t1 = ForChanText.extract_url(df)
    # t1 = t1.tweet_length(df)
    print(t1)
# class Spipe:
# def __init__(self, twitter_stream):
# self.twitter_stream = twitter_stream
# def __repr__(self):
# return f'TPipe: {self.twitter_stream!r}'
# @classmethod
# def spacy_stream_pipeline(cls, stream):
# cls.stage1(stream)
# @staticmethod
# def stage1(text):
# for doc in nlp.pipe(text):
# print([(ent.text, ent.label_) for ent in doc.ents])
# class TwitterPostprocessing(object):
# # def __init__(self, df):
# # df.set_index('idstr', inplace=True)
# # df['unix'] = pd.to_datetime(df['unix'])
# # df.insert(1, 'date', df['unix'].dt.date)
# # df.insert(2, 'time', df['unix'].dt.time)
# # df.drop(['unix'], axis=1, inplace=True)
# # df['hashtags'] = df['hashtags'].str.replace('[', '').str.replace(']', '')
# # df['mentions'] = df['mentions'].str.replace('[', '').str.replace(']', '')
# # self.df = df
# def tag_count(self, series):
# series = series.str.replace('[', '').str.replace(']', '')
# all_words = []
# for line in list(series):
# words = line.split(', ')
# for word in words:
# all_words.append(word)
# counter = sorted(Counter(all_words).elements())
# return pd.Series(counter)
# def sub_html(pat, replacement, text):
# text = re.sub(pat, replacement, text)
# text = text.strip()
# text = text.lower()
# STRIP_SPACE = "\s{2,}"
# text = re.sub(STRIP_SPACE, '', text)
# return str(text)
# def textblob_sentiment_raw(tweets):
# REMOVE_PAT2 = r"[%$*;\"_(),!.\/'+-=]"
# tweets = re.sub(REMOVE_PAT2, '', tweets)
# STRIP_SPACE = "\s{2,}"
# tweets = re.sub(STRIP_SPACE, '', tweets)
# analysis = TextBlob(tweets)
# sent = analysis.sentiment.polarity
# return str(sent)
# def preprocessed_tweets(tweets):
# # tweets = tweets.strip()
# tweets = tweets.lower()
# tweets = tweets.replace(r'#(\w+)', ' ')
# tweets = tweets.replace(r'&(\w+)', ' ')
# tweets = tweets.replace(r'@(\w+)', ' ')
# tweets = tweets.replace(r'\s{2,}', ' ')
# # stopwords = nltk.corpus.stopwords.words('english')
# text = "".join([word for word in tweets if word not in string.punctuation])
# # tokens = re.split(r'\W+', text)
# # text = [word for word in tokens if word not in stopwords]
# analysis = TextBlob(text)
# sent = analysis.sentiment.polarity
# return pd.Series(sent)