https://benalexkeen.com/resampling-time-series-data-with-pandas/
https://kanoki.org/2020/04/14/resample-and-interpolate-time-series-data/
# Keep only the orders whose Order_No appears in _invoice_list, newest first.
# NOTE(review): `od` and `_invoice_list` are defined elsewhere in the notebook.
ab = od[od['Order_No'].isin(_invoice_list)].sort_values('Order_Date', ascending=False)
ab.head()
import pandas as pd

# Example: keep only the rows where col_1 == 1.0 AND col_2 == 0.0.
df = pd.DataFrame(
    {
        "col_1": (0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0),
        "col_2": (0.0, 0.24, 1.0, 0.0, 0.22, 3.11, 0.0),
        "col_3": ("Mon", "Tue", "Thu", "Fri", "Mon", "Tue", "Thu"),
    }
)
# Element-wise equality tests combined with & build the boolean mask.
mask = df["col_1"].eq(1.0) & df["col_2"].eq(0.0)
df = df.loc[mask]
print(df)
# Top-10 customers by total order amount, largest first.
# The double brackets around 'Total_Amt' keep the result a DataFrame
# instead of a Series. NOTE(review): `oh` is loaded elsewhere in the notebook.
g1 = oh.groupby('Customer_Name')[['Total_Amt']].sum().sort_values('Total_Amt', ascending=False)[:10]
g1.head()
# Exam results, one row per (student, class); aggregate stats per exam year.
grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78]
df = pd.DataFrame(
    {
        'ID': [f"x{i}" for i in range(10)],
        'Gender': list('FMFMFMFMMM'),
        'ExamYear': ['2007'] * 3 + ['2008'] * 4 + ['2009'] * 3,
        'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra',
                  'stats', 'stats', 'algebra', 'bio', 'bio'],
        'Participated': ['yes', 'yes', 'yes', 'yes', 'no',
                         'yes', 'yes', 'yes', 'yes', 'yes'],
        'Passed': ['yes' if g > 50 else 'no' for g in grades],
        'Employed': [True] * 3 + [False] * 4 + [True, True, False],
        'Grade': grades,
    }
)
# Per-year: participation count, pass count, employed count, mean grade.
df.groupby('ExamYear').agg(
    {
        'Participated': lambda s: s.value_counts()['yes'],
        'Passed': lambda s: (s == 'yes').sum(),
        'Employed': lambda s: s.sum(),
        'Grade': lambda s: sum(s) / len(s),
    }
)
# Participated Passed Employed Grade
# ExamYear
# 2007 3 2 3 74.000000
# 2008 3 3 0 68.500000
# 2009 3 2 2 60.666667
The groupby-and-count idiom below and DataFrame.value_counts() achieve the same result.
https://stackoverflow.com/questions/47320572/pandas-groupby-and-count#47320593
# Count rows per (num_legs, num_wings) combination — two equivalent spellings.
# NOTE(review): `df` here is the Stack Overflow example frame, not defined above.
df.groupby(['num_legs', 'num_wings'])['num_legs'].count()
# DataFrame.value_counts counts unique rows directly; sort=False keeps group order.
df.value_counts(subset=['num_legs', 'num_wings'], sort=False)
# Timings from the linked answer (value_counts is slightly faster):
#640 µs ± 28.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
#568 µs ± 6.88 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
https://stackoverflow.com/questions/40950310/strip-trim-all-strings-of-a-dataframe#40950485
import pandas as pd
def trim_all_columns(df):
    """
    Trim whitespace from both ends of every string value in the dataframe.

    Non-string values (numbers, NaN, ...) pass through unchanged.
    """
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    # DataFrame.applymap was deprecated in pandas 2.1 in favour of
    # DataFrame.map; use whichever this pandas version provides.
    elementwise = getattr(df, "map", df.applymap)
    return elementwise(trim_strings)
# Simple demo: the padding around 'a' and 'c' is stripped, numbers untouched.
df = trim_all_columns(pd.DataFrame([[' a ', 10], [' c ', 5]]))
print(df)
Strip whitespace at the beginning and end of each string only.
df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
Remove runs of two or more whitespace characters inside strings, applied to every str-typed element of the dataframe.
import pandas as pd
import re
def substitute_text_pattern(pat, replacement, text):
    """Return *text* with every regex match of *pat* replaced by *replacement*."""
    # re.sub already returns a str, so the original's extra str() wrapper
    # was redundant and has been dropped.
    return re.sub(pat, replacement, text)
# Apply the substitution to every string cell: runs of 2+ whitespace
# characters are removed entirely. NOTE(review): `df` comes from the
# surrounding notebook; non-string cells pass through untouched.
df = df.applymap(lambda x: substitute_text_pattern(r'\s{2,}', '', x) if isinstance(x, str) else x)
Titanic Kaggle Competition.
Fill NaN values in age column for each gender based on median age for each value.
# Fill missing ages with the median age of the passenger's sex. The
# groupby/transform form covers both sexes (and any other category values)
# in one statement, replacing the two hand-written boolean-mask assignments.
df['Age'] = df['Age'].fillna(df.groupby('Sex')['Age'].transform('median'))
# Cast to int only works once every Age is filled in.
df['Age'] = df['Age'].astype(int)
https://stackoverflow.com/questions/28135436/concatenate-rows-of-two-dataframes-in-pandas
import pandas as pd

# A dict becomes a one-column DataFrame: the keys form the index.
data = {'a': 1, 'b': 2}
pd.Series(data).to_frame()              # column is named 0 by default
pd.Series(data).to_frame('ColumnName')  # column gets an explicit name
https://stackoverflow.com/questions/18837262/convert-python-dict-into-a-dataframe
# Wrapping the dict in a list makes its keys the columns of a one-row frame.
dict_ = {'key 1': 'value 1', 'key 2': 'value 2', 'key 3': 'value 3'}
pd.DataFrame.from_records([dict_])
# key 1 key 2 key 3
# 0 value 1 value 2 value 3
# Widen pandas' console output so large frames print without truncation.
for _option, _value in {
    'display.width': 270,
    'display.max_columns': 160,
    'display.max_colwidth': 160,
    'display.max_rows': 260,
}.items():
    pd.set_option(_option, _value)
https://www.journaldev.com/33472/pandas-dataframe-from_dict-convert-dictionary-to-dataframe
https://stackoverflow.com/questions/19377969/combine-two-columns-of-text-in-dataframe-in-pandas-python
https://datatofish.com/concatenate-values-python/
df = pd.DataFrame({'Year': ['2014', '2015'], 'quarter': ['q1', 'q2']})
# Vectorized string concatenation: same result as the row-wise
# apply(''.join, axis=1) but without a Python-level loop per row.
df['period'] = df['Year'] + df['quarter']
https://kanoki.org/2020/04/14/resample-and-interpolate-time-series-data/#upsample
Definition:
# Downsample into 3-second bins and average each bin.
# NOTE(review): `df2` with a 'date' column comes from the linked article.
df2.set_index('date').resample('3s').mean()
# Hourly per-bin range: pipe hands the lambda the whole Resampler, so
# x.max() - x.min() subtracts the binned min frame from the binned max frame.
df2.set_index('date').resample('H').pipe(lambda x: x.max() - x.min())
# or
# apply calls the lambda once per hourly group, yielding the same range.
df2.set_index('date').resample('H').apply(lambda x: x.max() - x.min())
With interpolate:
Use the interpolate function to fill the NaN rows created by upsampling, choosing a method such as:
Linear interpolation — a curve-fitting method that uses linear polynomials to construct new data points within the range of a discrete set of known data points.
def upsample_df(df: pd.DataFrame, freq: str) -> pd.DataFrame:
    """Upsample *df* to the finer frequency *freq*, forward-filling new rows.

    For smooth values instead of step-wise repetition, interpolate instead:
    df.resample(freq).mean().interpolate(method='linear')  # or
    method='spline', order=2 for a quadratic spline.
    """
    # .ffill() replaces the deprecated .pad(); convention='start' was
    # dropped from the call because it is the default (and only matters
    # when resampling a PeriodIndex anyway).
    return df.resample(freq).ffill()
With offset.
# loffset was removed in pandas 2.0: resample first, then shift the
# resulting index labels by the desired offset.
hourly = df2.set_index('date').resample('H').mean()
hourly.index = hourly.index + pd.Timedelta('10s')
import pandas as pd
# Load a whole MongoDB collection (a PyMongo cursor) into a DataFrame.
# NOTE(review): `tweets` and `db` are PyMongo objects created elsewhere.
df = pd.DataFrame(list(tweets.find()))
# or
from pandas import DataFrame
df = DataFrame(list(db.collection_name.find({})))  # fixed: missing closing ')'
# or
datapoints = list(db.collection_name.find({}))  # fixed: missing closing ')'
# pandas.io.json.json_normalize was removed from the public API;
# pd.json_normalize is the replacement and also flattens nested documents.
df = pd.json_normalize(datapoints)
df.head()
With PyMongoArrow
https://github.com/mongodb-developer/pymongoarrow-sample-code/blob/main/PyMongoArrow Demo.ipynb