Must Know Classes Functions
Collections Itertools Functools
Conditional For-Loop Try-Except Design Ipython
Numpy Pandas Matplotlib (Pyplot)
List & Dict & Set Comprehensions
# Nested comprehension: keep only the pairs whose first item is larger.
[(i, j) for i in range(3) for j in range(3) if i > j]
# [(1, 0), (2, 0), (2, 1)]

# map over one iterable: scale every element.
li = [1, 2, 3]
li = list(map(lambda x: x * 10, li))
# li = [10, 20, 30]

# map over several iterables in lockstep: element-wise maximum.
num1 = [100, 1, 20]
num2 = [19, 4, 94]
num3 = [40, 6, 30]
list(map(lambda x, y, z: max(x, y, z), num1, num2, num3))
# [100, 6, 94]

# filter is lazy; joining the result consumes it.
names = ['Liam', 'Olivia', 'Noah', 'Emma', 'Oliver', 'Ava']
choice = filter(lambda x: x.startswith('O'), names)
print(', '.join(choice))  # Olivia, Oliver

# zip pairs items up; zip(*pairs) splits them apart again (as tuples).
a = [1, 2, 3]
b = [4, 5, 6]
c = list(zip(a, b))  # [(1, 4), (2, 5), (3, 6)]
a, b = zip(*c)  # a=(1, 2, 3), b=(4, 5, 6)
Defining Functions with *arg and **kwarg
def example(a, *arg, b=0, **kwarg):
    """Print how the call arguments were bound to each kind of parameter.

    Args:
        a: First positional argument.
        *arg: Remaining positional arguments, collected into a tuple.
        b: Keyword-only argument (it follows ``*arg``); defaults to 0.
        **kwarg: Remaining keyword arguments, collected into a dict.
    """
    # Trailing comments show the output for
    # example(1, 2, 3, b=1, x='a', y=[1, 2, 3]).
    print(a)      # 1
    print(arg)    # (2, 3)
    print(b)      # 1
    print(kwarg)  # {'x': 'a', 'y': [1, 2, 3]}
example (1 , 2 , 3 , b = 1 , x = 'a' , y = [1 , 2 , 3 ])
Calling Functions with *arg and **kwarg
def func(greet, time, name):
    """Print the three arguments on one line, separated by single spaces."""
    print(greet, time, name)
func (* ["Good" , "Morning" ], ** {"name" : "Jay" })
# Good Morning Jay
a , b , * _ = [1 , 2 , 3 , 4 , 5 ]
# 1, 2, [3, 4, 5]
first , * amid , last = map (lambda x : x ** 2 , range (1 , 10000 ))
first # 1
last # 99980001
sales = [("Pencil" , 0.22 , 1500 ), ("Notebook" , 1.30 , 550 )]
for product , * _ in sales :
print (product )
# Pencil, Notebook
def compute(i):
    """Return the tuple ``(i, i**2, i**3, i**4, i**5)``."""
    return i, i ** 2, i ** 3, i ** 4, i ** 5
num , power , cube , * _ = compute (3 )
power # 9
cube # 27
number = {"one" : 1 , "two" : 2 }
letter = {"a" : "A" , "b" : "B" }
combine = {** number , ** letter }
combine # {'one': 1, 'two': 2, 'a': 'A', 'b': 'B'}
Generator (map, filter, zip)
from itertools import takewhile


def square_it(value):
    """Lazily yield ``i ** 2`` for every ``i`` in ``range(value)``."""
    for i in range(value):
        yield i ** 2


li = square_it(10_000_000)
# The squares only grow, so stop at the first one that is >= 50 instead of
# testing all ten million values.
list(takewhile(lambda sq: sq < 50, li))  # [0, 1, 4, 9, 16, 25, 36, 49]
def count_decorator(count):
    """Build a decorator that runs the decorated function ``count`` times.

    Args:
        count: How many times each call to the wrapper invokes the function.

    Returns:
        A decorator. The wrapper it produces prints the function's name and
        call arguments, invokes the function ``count`` times, and returns the
        result of the last invocation (``None`` when ``count`` is 0).
    """
    from functools import wraps

    def decorator(orig_func):
        @wraps(orig_func)  # keep the wrapped function's __name__ / __doc__
        def wrapper(*args, **kwargs):
            print(f"func name: {orig_func.__name__}")
            print(f"func args: {args}, {kwargs}")
            result = None
            for _ in range(count):  # use the decorator argument
                result = orig_func(*args, **kwargs)
            return result

        return wrapper

    return decorator  # the actual decorator, closed over `count`
@count_decorator(2)
def greet(msg):
    """Print ``msg``; decorated with ``count_decorator(2)``."""
    print(msg)
greet ("hello" )
# func name: greet
# func args: ('hello',), {}
# hello
# hello
@contextmanager
def enterFolder(folderName):
    """Temporarily change the working directory to ``folderName``.

    The directory that was current on entry is restored on exit, including
    when the body of the ``with`` block raises.

    Args:
        folderName: Directory to switch into; passed to ``os.chdir``.
    """
    home = os.getcwd()
    os.chdir(folderName)
    try:
        yield
    finally:
        # Without try/finally an exception in the with-body would leave the
        # process stranded in folderName.
        os.chdir(home)
# Both context managers are entered left to right, so the file is created
# inside 'folder1'.
with enterFolder('folder1'), open('example1.txt', 'w') as f:
    f.write('file1')
class BinaryInt(str):
    """A ``str`` whose text is the binary representation of an integer."""

    def __new__(cls, val):
        # "b" formats the integer as bare binary digits (no "0b" prefix and
        # no padding), so the text can be parsed back with int(text, 2).
        return str.__new__(cls, f"{val:b}")

    def __add__(self, val):
        """Return the binary text of this value plus the integer ``val``.

        The result is another ``BinaryInt`` so additions can be chained.
        """
        return type(self)(int(self, 2) + val)
a = BinaryInt (2 )
print (a ) # 10
print (a + 4 ) # 110
class Meta(type):
    """Metaclass that rejects classes (other than ``Base``) lacking ``must_to_do``."""

    def __new__(mtcls, name, bases, attrs):
        # `attrs` holds only the names defined in the new class body, so an
        # inherited must_to_do does not satisfy this check.
        if name != "Base" and "must_to_do" not in attrs:
            raise TypeError("Bad Class: must_to_do() is needed")
        return super().__new__(mtcls, name, bases, attrs)


class Base(metaclass=Meta):
    """Base class whose subclasses must define ``must_to_do``."""

    def server_func(self):
        """Return the result of the subclass's ``must_to_do``."""
        return self.must_to_do()
class Derived (Base ):
...
# TypeError: Bad Class: must_to_do() is needed
Threading & Multiprocessing
import concurrent .futures
# Thread pool: submit one task per URL and handle results as they finish.
# NOTE(review): load_url and URLS are not defined in this snippet.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(load_url, url, 60) for url in URLS]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()  # re-raises if the task raised
        print(len(result))

# Process pool: map() yields results in input order; the second iterable
# supplies the same 60 as the second argument of every call.
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(load_url, URLS, [60] * len(URLS), chunksize=4)
    for result in results:
        print(len(result))
class Person:
    """Minimal class used to show that ``p.say()`` equals ``Person.say(p)``."""

    def __init__(self, name):
        self.name = name

    def say(self):
        """Return a short self-introduction."""
        return f"I'm {self.name}"
p = Person ("Jay" )
p .say () == Person .say (p ) # True
variables (class & instance)
class Employee:
    """Shows a class variable shared by all instances next to an instance variable."""

    num_emp = 0  # Class variable: incremented once per created Employee

    def __init__(self, pay):
        self.pay = pay  # Instance variable
        # Assign through the class so the shared counter is updated rather
        # than creating a per-instance attribute.
        Employee.num_emp += 1
e1 = Employee (100 )
e2 = Employee (200 )
e1 .num_emp # 2
Employee .num_emp # 2
e1 .pay # 100
Employee .pay # AttributeError: type object 'Employee' has no attribute 'pay'
method vs. classmethod vs. staticmethod
class Person:
    """Shows an instance initializer, a staticmethod helper and a classmethod constructor."""

    def __init__(self, name, age):
        self.name = name
        self.age = age

    @staticmethod
    def splitPersonString(string, split_sign="-"):
        """Split ``string`` on ``split_sign`` and return the list of parts."""
        return string.split(split_sign)

    @classmethod
    def fromString(cls, cls_str):
        """Alternate constructor: build an instance from ``"name, age"``.

        The parts are passed to the constructor unchanged, so ``age`` is
        stored as a string.
        """
        return cls(*cls.splitPersonString(cls_str, ", "))
p1 = Person .fromString ("Jay, 99" )
p1 .name # Jay
p1 .age # 99
_ (private) vs. __ (name mangling)
class Dog:
    """Contrasts a single-underscore name with a double-underscore (mangled) name."""

    _weight = 5  # single underscore: non-public by convention only, still accessible

    def __bark(self):  # double underscore: name-mangled to _Dog__bark
        print("bark")
dog = Dog ()
dog ._weight # 5
dog .__bark () # AttributeError: 'Dog' object has no attribute '__bark'
dog ._Dog__bark () # bark
@property (getter, setter)
class User:
    """User with a computed ``fullname`` and a write-only ``password``.

    Assigning ``password`` stores only ``password_hash``; reading
    ``password`` raises ``AttributeError``.
    """

    def __init__(self, first_name, last_name, password):
        self.first_name = first_name
        self.last_name = last_name
        self.password = password  # routed through the setter below

    @property
    def fullname(self):
        """First and last name joined by one space."""
        return f"{self.first_name} {self.last_name}"

    @property
    def password(self):
        raise AttributeError("password is not readable.")

    @password.setter
    def password(self, password):
        from hashlib import md5

        # Hash the value that was actually assigned (UTF-8 encoded), not a
        # fixed placeholder literal.
        # NOTE(review): MD5 is not suitable for storing real passwords; use a
        # salted KDF such as hashlib.scrypt or hashlib.pbkdf2_hmac for that.
        self.password_hash = md5(str(password).encode("utf-8")).hexdigest()
user = User ("Mimi" , "Wang" , "0000" )
user .fullname # Mimi Wang
user .password_hash # a 32-character hexadecimal MD5 digest string
user .password # AttributeError: password is not readable.
LEGB (local, enclosing, global, builtins)
min ([1 , 2 , 31 ]) # builtins min
min = "global min"
def outer():
    """Bind ``min`` in the enclosing scope and define ``inner`` with its own local ``min``."""
    # we can do "global min" here to change global
    min = "enclosing min"

    def inner():
        # we can do "nonlocal min" here to change enclosing
        min = "local min"
from abc import ABC , abstractmethod
class Base(ABC):
    """Abstract base: subclasses must provide the ``foo`` property and the ``do`` method."""

    @property
    @abstractmethod
    def foo(self):
        ...

    @abstractmethod
    def do(self):
        ...
from dataclasses import InitVar , dataclass , field
from typing import List
@dataclass
class InventoryItem:
    """Dataclass demo: defaults, a hidden-from-repr field, a mutable default and an InitVar."""

    name: str
    unit_price: float = field(default=0.0)
    quantity_on_hand: int = field(default=0, repr=False)  # omitted from repr
    parts: List[str] = field(default_factory=list)  # fresh list per instance
    parts_number: InitVar[int] = 0  # init-only: passed to __post_init__, not stored as a field

    def __post_init__(self, parts_number):
        # Append "part1" .. "part<parts_number>" to whatever parts were given.
        self.parts.extend(f"part{i}" for i in range(1, parts_number + 1))
item = InventoryItem ("product" , parts_number = 2 )
# InventoryItem (name = 'product', unit_price=0.0, parts=['part1', 'part2'])
Classes in Dynamic Language
def getClass(x):
    """Return a class created at run time when ``x == 1``, otherwise ``None``.

    The class statement runs once per loop iteration and rebinds ``Example``
    each time, so the returned class is the one from the last iteration
    (``a == 10``).
    """
    if x == 1:
        for i in range(11):
            class Example:
                a = i
        return Example
    return None
cls = getClass (1 )
cls .b = "123"
print (cls .a , cls .b ) # 10 123
def add_with_b(b):
    """Return a function that adds ``b`` (captured by closure) to its argument."""
    def add(a):
        return a + b

    return add
add4 = add_with_b (4 )
add4 (3 ) # 7
add4 (7 ) # 11
class Cat:
    """Empty class whose ``name`` and ``age`` attributes are attached at run time."""

    def __repr__(self):
        return f"({self.name}: {self.age})"


listOfCats = []
attrs = [{"name": "meow1", "age": 5}, {"name": "meow2", "age": 10}]
for attr in attrs:
    cat = Cat()
    # setattr lets the attribute names come from data instead of source code.
    for key, val in attr.items():
        setattr(cat, key, val)
    listOfCats.append(cat)
print(listOfCats)
# [(meow1: 5), (meow2: 10)]
Functions in Dynamic Language
for i in range(100):
    # `def` is an ordinary statement: it runs on every iteration and rebinds
    # `say` to a new function object each time.
    def say():
        # `i` is looked up when say() is called, not when it is defined.
        print(i)
def returnFunc(a):
    """Return a one-argument function chosen by the value of ``a``.

    For ``a < 100`` the returned function prints ``a * b``; otherwise it
    prints ``a + b``.
    """
    if a < 100:
        def mul(b):
            print(a * b)

        return mul

    def add(b):
        print(a + b)

    return add
from collections import defaultdict
d = defaultdict (list )
d ["a" ] = [1 , 2 , 3 ]
d ["b" ].append (4 )
d ["c" ].extend ([5 , 6 ])
# defaultdict(<class 'list'>, {'a': [1, 2, 3], 'b': [4], 'c': [5, 6]})
from collections import OrderedDict
location = ["C" , "B" , "A" ]
population = [32 , 46 , 12 ]
d = OrderedDict ({l : p for l , p in zip (location , population )})
# OrderedDict([('C', 32), ('B', 46), ('A', 12)])
d ["D" ] = 44
# OrderedDict([('C', 32), ('B', 46), ('A', 12), ('D', 44)])
d .popitem (last = False )
# OrderedDict([('B', 46), ('A', 12), ('D', 44)])
d .move_to_end ("D" , last = False )
# OrderedDict ([( 'D', 44), ('B', 46), ('A', 12)])
from collections import Counter
c = Counter (cats = 4 , dogs = 8 )
# Counter({'dogs': 8, 'cats': 4})
c .update (birds = 10 )
# Counter({'birds': 10, 'dogs': 8, 'cats': 4})
c = c - Counter ({"birds" : 5 })
# Counter({'dogs': 8, 'birds': 5, 'cats': 4})
c .most_common (2 )
# [('dogs', 8), ('birds', 5)]
from collections import namedtuple
Dog = namedtuple ("Dog" , "name, age" )
d1 = Dog ("funny" , 4 )
features = ["happy" , 3 ]
d2 = Dog ._make (features )
# Dog(name='happy', age=3)
d2 ._asdict ()
# {'name': 'happy', 'age': 3}  (a plain dict on Python 3.8+, an OrderedDict before that)
from collections import deque
li = [40, 30, 50, 46, 39, 44]
d = deque(li[:2])
# Let's compute the moving average with range=3
d.appendleft(0)  # placeholder so the first popleft() removes nothing real
s = sum(d)
for elem in li[2:]:
    # Slide the window: add the incoming value, drop the outgoing one.
    s += elem - d.popleft()
    d.append(elem)
    print(s / 3)
# 40.0, 42.0, 45.0, 43.0
from itertools import count
gen = count (2.5 , 0.5 )
for x in gen :
print (x )
# 2.5, 3.0, 3.5, 4.0, ... non-stop
from itertools import cycle
gen = cycle ([1 , 2 , 3 ])
for x in gen :
print (x )
# 1, 2, 3, 1, 2, ... non-stop
from itertools import repeat
class Cat:
    """Placeholder class for the ``repeat`` demo."""


# repeat() yields the very same object each time, not copies.
gen = repeat(Cat(), 2)
for cat in gen:
    print(cat)
# <__main__.Cat object at 0x0000019AC1C5D348>
# <__main__.Cat object at 0x0000019AC1C5D348>
Iterators terminating on the shortest input sequence
import operator
from itertools import accumulate
gen = accumulate ([1 , 2 , 3 , 4 ])
list (gen ) # [1, 3, 6, 10]
gen = accumulate ([1 , 2 , 3 , 4 ], func = operator .mul )
list (gen ) # [1, 2, 6, 24]
from itertools import chain
gen = chain ([1 , 2 ], [3 , 4 ])
list (gen ) # [1, 2, 3, 4]
gen = chain ("AB" , "CD" )
list (gen ) # [A, B, C, D]
from itertools import compress
gen = compress ([1 , 2 , 3 ], [1 , 0 , 1 ])
gen = compress ([1 , 2 , 3 ], [True , False , True ]) # same
list (gen ) # [1, 3]
from itertools import filterfalse
gen = filterfalse (lambda x : x % 2 == 0 , [1 , 2 , 3 ])
list (gen ) # [1, 3]
from itertools import groupby
gen = groupby ("AABBCCCAA" ) # default func = lambda x: x
for k , g in gen :
print (k , list (g ))
# A [A, A]
# B [B, B]
# C [C, C, C]
# A [A, A]
gen = groupby ([1 , 2 , 3 , 4 ], lambda x : x // 3 )
for k , g in gen :
print (k , list (g ))
# 0 [1, 2]
# 1 [3, 4]
gen = groupby ([("A" , 100 ), ("B" , 200 ), ("C" , 600 )], lambda x : x [1 ] > 500 )
for k , g in gen :
print (k , list (g ))
# False [(A, 100), (B, 200)]
# True [(C, 600)]
from itertools import islice
gen = islice([1, 2, 3], 2)  # equals to A[:2]
list(gen)  # [1, 2]
gen = islice("ABCD", 2, 4)  # equals to A[2:4]
list(gen)  # [C, D]
gen = islice("ABCD", 0, None, 2)  # equals to A[::2]
list(gen)  # [A, C]
from itertools import starmap
# with only one argument
gen = starmap (lambda x : x .lower (), "ABCD" )
list (gen ) # [a, b, c, d]
# with 2 arguments
gen = starmap (lambda x , y : x + y , [(1 , 2 ), (3 , 4 )])
list (gen ) # [3, 7]
# with different size of arugments
gen = starmap (lambda * keys : sum (keys ) / len (keys ), [[3 , 8 , 3 ], [4 , 2 ]])
list (gen ) # [4.6666667, 3.0]
from itertools import takewhile
gen = takewhile (lambda x : x < 2 , [1 , 2 , 3 , 2 , 1 ])
list (gen ) # [1]
gen = takewhile (lambda x : x .isupper (), "ABCdefgHIJ" )
list (gen ) # [A, B, C]
from itertools import dropwhile
# dropwhile skips items only until the predicate first fails; everything
# after that point is passed through, even items that would match again.
gen = dropwhile(lambda x: x < 2, [1, 2, 3, 2, 1])
list(gen)  # [2, 3, 2, 1]
gen = dropwhile(lambda x: x.isupper(), "ABCdefgHIJ")
list(gen)  # [d, e, f, g, H, I, J]
from itertools import zip_longest
gen = zip_longest ("ABC" , ("X" , "Y" ))
list (gen ) # [('A', 'X'), ('B', 'Y'), ('C', None)]
gen = zip_longest ("ABC" , [1 , 2 ], fillvalue = - 1 )
list (gen ) # [('A', 1), ('B', 2), ('C', -1)]
from itertools import product
gen = product ("AB" , "CD" )
list (gen ) # [AC, AD, BC, BD]
gen = product ("AB" , repeat = 2 )
list (gen ) # [AA, AB, BA, BB]
gen = product ("AB" , "CD" , repeat = 2 )
list (gen )
# [ACAC, ACAD, ACBC, ACBD,
# ADAC, ADAD, ADBC, ADBD,
# BCAC, BCAD, BCBC, BCBD,
# BDAC, BDAD, BDBC, BDBD]
from itertools import permutations
gen = permutations("ABC")  # same as r=3
list(gen)  # [ABC, ACB, BAC, BCA, CAB, CBA]
gen = permutations("ABC", r=2)
list(gen)  # [AB, AC, BA, BC, CA, CB]
gen = permutations("ABC", r=1)
list(gen)  # [A, B, C]
from itertools import combinations
gen = combinations("ABC", 1)
list(gen)
# [A, B, C]
gen = combinations("ABC", 2)
list(gen)
# [AB, AC, BC]
gen = combinations("ABC", 3)
list(gen)
# [ABC]
combinations_with_replacement
from itertools import combinations_with_replacement
gen = combinations_with_replacement("ABC", 1)
list(gen)
# [A, B, C]
gen = combinations_with_replacement("ABC", 2)
list(gen)
# [AA, AB, AC,
#  BB, BC,
#  CC]
gen = combinations_with_replacement("ABC", 3)
list(gen)
# [AAA, AAB, AAC, ABB, ABC, ACC,
#  BBB, BBC, BCC,
#  CCC]
from functools import reduce
reduce (lambda x , y : x - y , [1 , 2 , 3 , 4 , 5 ], 100 ) # 85
first_name = "Kain"
last_name = "Mccarthy"
print (f"Hi, I'm { first_name } { last_name } ." ) # Hi, I'm Kain Mccarthy.
pi = 3.14159265359
print (f"{ pi :.2f} " ) # 3.14
d = {"name" : "Shelly" }
print (f"She is { d ['name' ]} " ) # She is Shelly
i = 1000000
print (f"{ i :,} " ) # 1,000,000
# Ref:
# * https://youtu.be/nghuHvKLhJA
# * https://blog.louie.lu/2017/08/08/outdate-python-string-format-and-fstring/
a = 100_000_000
b = 10_000_000
c = 1_0_0
print (f"{ a + b + c :,} " ) # 110,000,100
# Ref:
# * https://youtu.be/C-gEQdGVXbk?t=140
long_list = [i for i in range (100_000_000 )]
long_set = set (long_list )
% % time
100_000_000 in long_list
# False
# Wall time: 1.26 s
% % time
100_000_000 in long_set
# False
# Wall time: 0 ns
# Ref:
# * https://stackoverflow.com/questions/2831212/python-sets-vs-lists/17945009
# * https://youtu.be/r3R3h5ly_8g?t=1010
a , b = 1 , 2
a # 1
b # 2
a , b = b , a
a # 2
b # 1
# Ref:
# * https://youtu.be/VBokjWj_cEA?list=LL&t=445
if x < 1 :
x += 1
else :
x -= 1
# equivalent to:
x = (x + 1 ) if (x < 1 ) else (x - 1 )
# Ref:
# * https://www.youtube.com/watch?v=C-gEQdGVXbk&t=34s
arr = ["a" , "b" , "c" ]
for index , element in enumerate (arr ):
print (index , element )
# 0 a
# 1 b
# 2 c
for index , element in enumerate (arr , start = 3 ):
print (index , element )
# 3 a
# 4 b
# 5 c
# Ref
# * https://youtu.be/VBokjWj_cEA?list=LL&t=190
for text in "to be or not to be".split():
    # split() with no arguments already drops surrounding whitespace.
    if text.startswith("o"):
        print(f"Found it! `{text}`")
        break
else:
    # Runs only when the loop finished without hitting `break`.
    print("Not found")
# Found it! `or`
# Ref:
# * https://www.youtube.com/watch?v=Dh-0lAyc3Bc
try :
print (1 / 1 )
except Exception as e :
print (e )
else :
print ("Safe" ) # executed when except didn't happen
finally :
print ("Done" ) # Always executed
# 1.0
# Safe
# Done
# Ref:
# * https://youtu.be/VBokjWj_cEA?list=LL&t=1331
def func(a: str, b: int = 3) -> str:
    """Return ``a`` repeated ``b`` times (annotations are not enforced at run time)."""
    return a * b
func .__annotations__ # {'a': <class 'str'>, 'b': <class 'int'>, 'return': <class 'str'>}
func ("hi" ) # hihihi
func ("hi" , 5 ) # hihihihihi
def func(a: "str longer than 5", b: 1 + 2 = 3) -> "str longer b times":
    """Return ``a`` repeated ``b`` times.

    Annotations may be arbitrary expressions; they are evaluated and stored
    in ``__annotations__`` (``1 + 2`` is stored as ``3``) but never checked.
    """
    return a * b
func .__annotations__ # {'a': 'str longer than 5', 'b': 3, 'return': 'str longer b times'}
func ("hi" ) # hihihi
func ("ohayou" , 2 ) # ohayouohayou
from typing import Any , Dict , Iterable , List , Union
def func(a: List[int], b: Union[str, int], c: Dict[str, int], d: Iterable, e: Any):
    """Print something about each argument to illustrate ``typing`` annotations.

    Args:
        a: Its length is printed.
        b: Printed as-is.
        c: Mapping; the value under the key ``'something'`` is printed.
        d: Each element is printed on its own line.
        e: Its type is printed.
    """
    print(len(a))
    print(f"{b} can be str or int.")
    print(f"{c['something']} will return int.")
    for i in d:
        print(i)
    print(f"{type(e)} can be any type.")
# Ref:
# * https://myapollo.com.tw/zh-tw/python-typing-module/
# Style 1: `pass` as the placeholder body
def my_abstract_method(self):
    pass


# Style 2: the Ellipsis literal as the placeholder body
def my_abstract_method(self):
    ...


# Style 3: a docstring alone is a valid body
def my_abstract_method(self):
    """
    This function is ...
    """
# Ref:
# * https://stackoverflow.com/questions/55274977/when-is-the-usage-of-the-python-ellipsis-to-be-preferred-over-pass
# * https://stackoverflow.com/questions/772124/what-does-the-ellipsis-object-do
VSCode Python Interactive window
#%%
1 + 1
# 2
# Ref:
# * https://code.visualstudio.com/docs/python/jupyter-support-py
% time sleep (0.3 )
# Wall time: 310 ms
% timeit sleep (0.3 )
# 311 ms ± 2.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
% % time
for i in range (10 ):
sleep (0.1 )
# Wall time: 1.09 s
% % timeit
for i in range (10 ):
sleep (0.1 )
# 1.09 s ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
!pip install - U memory_profiler
% load_ext memory_profiler
% memit [i for i in range (1000 )]
# peak memory: 51.31 MiB, increment: 0.36 MiB
% % memit
l = []
for x in range (10000 ):
l .append (x * 2 )
# peak memory: 52.76 MiB, increment: 0.70 MiB
from pathlib import Path

sub_folder = Path("subfolder/subfolder")
sub_folder.mkdir(parents=True, exist_ok=True)  # create missing parents too
file_ = sub_folder / Path("test.txt")
file_.touch()
file_.write_text("Hello")
file_.read_text()
file_.unlink()  # delete the file
# rmdir() removes only the innermost directory; the outer "subfolder" stays.
Path("subfolder/subfolder").rmdir()
np .array ([[1 , 2 ], [3 , 4 ], [5 , 6 ]]) # create from list
np .zeros ((3 , 3 )) # create filled with 0's
np .ones ((2 , 4 , 4 )) # create filled with 1's
np .empty ((5 , 2 )) # create with speed
np .arange (2 , 10 , 3 ) # create array from range (start, end, step_size)
np .linspace (5 , 50 , 20 ) # create a linear space (start, end, num_elements)
# create from random generator
rng = np .random .default_rng (seed = 42 )
rng .random ((2 , 4 ))
rng .normal (3 , 2.5 , size = (2 , 4 )) # sample from N(3, 6.25)
rng .integers (low = 2 , high = 10 , size = (10 , 2 )) # random integer matrix
np .sort (a , axis = None )
np .sort (a , axis = - 1 )[::- 1 ]
a .sort ()
a [::- 1 ].sort ()
np .concatenate ((a , b ), axis = None )
np .concatenate ((a , b ), axis = 2 )
a = np.arange(5)  # [0, 1, 2, 3, 4]
b = np.ones(5, dtype=int)  # [1, 1, 1, 1, 1]
a + b  # [1 2 3 4 5]
a - b  # [-1  0  1  2  3]
a ** 2  # [ 0  1  4  9 16]  (`**` is power; `^` would be bitwise XOR)
a * 10  # [ 0 10 20 30 40]
a > 2  # [False False False  True  True]
np.sqrt(a)  # [0.        , 1.        , 1.41421356, 1.73205081, 2.        ]
a * b  # [0 1 2 3 4]  element-wise product
a @ b  # 10  dot product
All (None) Column-wise (0), Row-wise (1)
A = np .random .default_rng (42 ).random ((2 , 4 ))
# [[0.77395605, 0.43887844, 0.85859792, 0.69736803],
# [0.09417735, 0.97562235, 0.7611397 , 0.78606431]])
A .max () # 0.97562235
A .max (axis = 0 ) # [0.77395605, 0.97562235, 0.85859792, 0.78606431]
A .max (axis = 1 ) # [0.85859792, 0.97562235]
A .mean () # 0.6732255180088094
A .mean (axis = 0 ) # [0.4340667 , 0.7072504 , 0.80986881, 0.74171617]
A .mean (axis = 1 ) # [0.69220011, 0.65425093]
# Index and slicing arrays
x [1 , 3 ] == x [1 ][3 ]
y [1 :5 :2 , ::3 ]
# Indexing arrays
x [np .array ([0 , 1 , 2 , - 1 , - 2 ])]
y [np .array ([1 , 2 , 3 ]), 1 :4 :2 ]
y [np .array ([1 , 2 ]), np .array ([- 1 , - 1 ])]
# Masking arrays
x [x > 5 ]
x [(x % 2 == 0 ) | (x > 7 )]
y [[True ]* 3 + [False ] + [True ] + [False ], 2 ::2 ]
# Ellipsis syntax
x [- 1 , ..., 3 ] # same as x[-1, :, 3]
x [:3 , ...] # same as x[0:3, :, :] and x[0:3] and x[:3]
x [::2 , ..., np .array ([0 , 2 ])] # same as x[0:5:2, :, np.array([0, 2])]
A = np .array ([[[1 , 2 , 3 ], [4 , 5 , 6 ]], [[4 , 6 , 8 ], [2 , 1 , 6 ]]])
A .shape # (2, 2, 3)
A = A .reshape (3 , 2 , 2 ) # (3, 2, 2)
A = A [np .newaxis , ...] # (1, 3, 2, 2)
A = np .expand_dims (A , axis = 4 ) # (1, 3, 2, 2, 1)
A = A .flatten () # (12,)
A = A .reshape (2 , - 1 , 2 ) # (2, 3, 2)
# shallow copy: values will change on every variable
a = np .arange (10 ).reshape (5 , 2 )
b = a .view ()
c = a .reshape (- 1 )
d = a [:3 , :1 ]
# deep copy: copy and create an entirely new array
a = np .arange (10000000 )
b = a [:100 ].copy ()
del a
# scalar broadcasting
a = np .array ([1 , 2 , 3 ])
a * 3 # [3, 6, 9]
# general broadcasting
a = np .ones ( (8 , 1 , 6 , 1 ))
b = np .zeros ( (7 , 1 , 5 ))
(a * b ).shape # 8, 7, 6, 5
# outer sum: broadcasting a (4, 1) column against a (3,) row
a = np .arange (4 )[:, np .newaxis ] # (4, 1)
b = np .array ([1 , 2 , 3 ]) # (3,)
a + b # (4, 3)
# [0] + [1, 2, 3] = [1 2 3]
# [1] [2 3 4]
# [2] [3 4 5]
# [3] [4 5 6]
# Create Series
pd .Series ([1 , 2 , 3 , 4 , 5 ])
pd .Series (np .arange (1 , 6 ), index = list ("abcde" ))
pd .Series ({"a" : 100 , "b" : 50 , "c" : 120 })
pd .Series ("hi" , index = list ("12345" ))
# Create DataFrame
pd .DataFrame ({
"col_1" : [1 , 2 , 3 , 4 , 5 ],
"col_2" : np .arange (1 , 6 ),
"col_3" : pd .Series (np .arange (1 , 7 ), index = list ("abc123" )),
}, index = list ("abcde" ))
pd .DataFrame (
[
{"a" : 1 , "b" : 2 },
{"b" : 10 , "c" : 5 },
{"a" : 55 , "b" : 489 , "c" : 32 , "d" : 590 },
],
index = ["first" , "second" , "third" ],
columns = list ("ab" )
)
pd .DataFrame (
np .arange (10 ).reshape (2 , 5 ),
# [[0,1,2,3,4], [5,6,7,8,9]]
index = pd .date_range ("20200101" , periods = 2 ),
columns = list ("abcde" ))
# Viewing
df .head (2 )
df .tail (3 )
df .index
df .columns
df .to_numpy ()
df .sort_index ()
df .sort_values ("col_name" )
|                 | Single Column                             | Multiple Columns                                       | Continuous Columns                 | All Columns        |
| Single Row      | df.loc[row, column] or df.at[row, column] | df.loc[row, [column, column]]                          | df.loc[row, column:column]         | df.loc[row]        |
| Multiple Rows   | df.loc[[row, row], column]                | df.loc[[row, row], [column, column]]                   | df.loc[[row, row], column:column]  | df.loc[[row, row]] |
| Continuous Rows | df.loc[row:row, column]                   | df.loc[row:row, [column, column]]                      | df.loc[row:row, column:column]     | df[row:row]        |
| All Rows        | df[column]                                | df[[column, column]] or df.loc[:, [column, column]]    | df.loc[:, column:column]           | df                 |
# Positional equivalents assume row1..row5 / col1..col5 are the first five
# rows / columns — TODO confirm against the actual frame.
df ["col1" ]
df [["col1" , "col2" ]]
df ["row1" :"row5" ]
df .loc ["row1" , "col1" ] # df.iloc[0, 0]
df .at ["row1" , "col1" ] # df.iat[0, 0]
df .loc ["row1" , ["col1" , "col2" ]] # df.iloc[0, [0, 1]]
df .loc ["row1" , "col1" :"col5" ] # df.iloc[0, 0:5] (label slices include the end label)
df .loc [["row1" , "row2" ]] # df.iloc[[0, 1]]
df .loc ["row1" :"row5" , "col1" ] # df.iloc[0:5, 0] (label slices include the end label)
df [(df ["col1" ] > 18 )]
df [(df > 6 ) & (df < 25 )]
df [df ["col1" ].isin ([10 , 15 , 0 ])]
df.iloc is the same as df.loc, but selects by integer position instead of by label.
df.iat is the same as df.at, but selects by integer position instead of by label.
Details 🔥
Setting, Deleting, and Handling
# Modify columns
df["col1"] += 10
df.loc[:, "col1"] = "bar"
df.loc[:, ["col1", "col3"]] = np.arange(12).reshape(6, 2)
# Modify single element
df.loc["row1", "col1"] = 0
df.iloc[0, 0] = 1
# Modify by boolean indexing
df[df < 100] = -df
# Append
df["total"] = df.sum(axis=1).to_numpy()
df["gt50000"] = df["total"] > 50000  # name must match the pop() further down
df["foo"] = "bar"
# Insert
df.insert(0, "col0", df["col2"][:2])  # col_index, col_name, values
# Delete column
del df["total"]
df.drop(columns=["foo"], inplace=True)  # same as `df.drop(["foo"], axis=1)`
gt50000 = df.pop("gt50000")  # removes the column and returns it
# Delete row
df.drop(["e", "d"], inplace=True)
# Handle NaN (both return new objects; miss_df itself is unchanged)
miss_df.dropna(how='any')
miss_df.fillna(value=10000000)
Operations and Apply Functions
# Arithmetic
df + df2
df - df .iloc [0 ]
1 / df
# Numpy
np .sqrt (df )
np .max (df , axis = 1 )
# Built-in
df .mean ()
df .max (axis = 1 )
# Apply
df .apply (np .cumsum , axis = 1 )
df .apply (lambda x : x .sum () / x .size ) # x means df
# Series
s .value_counts ()
s .str .upper ()
s .str .split ("-" ).str .get (0 )
# Concat rows
pd .concat ([df [:3 ], df .iloc [7 :, :2 ]])
# Merge two DataFrame
pd .merge (df , df2 , on = "name" , how = "right" )
Grouping and Categorical Data Type
# Groupby
df.groupby("col_A").sum()
df.groupby(["col_A", "col_B"]).max()
# Categorical - discrete
df["grade"] = df["grade"].astype("category")
# Assigning to `.cat.categories` is no longer supported (removed in pandas 2.0);
# rename_categories returns a new Series with the categories relabelled.
df["grade"] = df["grade"].cat.rename_categories(["Bad", "Good", "Excellent"])
df.sort_values(by="grade")
df.groupby("grade").size()
# Categorical - continuous
# bins 0, 20, ..., 100 give five intervals, one per label
df["grade-labels"] = pd.cut(df["score"], bins=range(0, 120, 20), labels=list("EDCBA"))
# Rename Columns
df .columns = ["col_one" , "col_two" ]
df = df .add_prefix ("Xx_" )
df = df .add_suffix ("_xX" )
df .columns = df .columns .str .replace ("Xx" , "Oo" )
df .columns = df .columns .str .replace ("xX" , "oO" )
# Reverse Row or Column Order
df .loc [::- 1 ].reset_index (drop = True ) # reverse rows
df .loc [:, ::- 1 ] # reverse columns
# Split DataFrame into 2 random subsets
sub1 = df.sample(frac=0.75, random_state=42)
sub2 = df.drop(sub1.index)  # everything that was not sampled
# Reorder the rows by index. Assigning a sorted index to `.index` would only
# relabel the rows in their shuffled order and detach labels from their data.
sub1 = sub1.sort_index()
sub2 = sub2.sort_index()
# Filter by Category (or Largest Category)
df [df .genre .isin (["A" , "D" ])]
df [~ df .genre .isin (["A" , "D" ])]
df [df .genre .isin (df .genre .value_counts ().nlargest (1 ).index )]
# Split String into Multiple Columns
df [["first" , "last" ]] = df ["name" ].str .split (' ' , expand = True )
df ["city" ] = df ["location" ].str .split (", " , expand = True )[0 ]
# Change Display Options (Not Change Data)
pd .set_option ("display.float_format" , "${:.2f}" .format )
pd .reset_option ("display.float_format" )
# Style a DataFrame
style = {"Date": "{:%Y/%m/%d}", "Value": "${:d}", "Volume": "{:,}"}
# Styler.hide(axis="index") replaces hide_index(), which was removed in pandas 2.0.
(
    df.style.format(style)
    .hide(axis="index")
    .highlight_max("Value", color="red")
    .highlight_min("Value", color="green")
    .bar("Area", color="orange", align="zero")
    .background_gradient(subset="Volume", cmap="Greens")
    .set_caption("Random Chart")
)
import matplotlib .pyplot as plt
# with this magic function, we can skip `plt.show()`
% matplotlib inline
plt .plot (np .sin (np .linspace (0 , 10 , 100 )), "*-b" , lw = 2 , markersize = 5 , label = "sin(x)" )
plt .plot (np .log (np .arange (100 )), c = "g" , ls = "--" , marker = "." , lw = 2 , markersize = 5 , label = "log(x)" )
plt .xlabel ("X here" )
plt .ylabel ("Y here" )
plt .title ("sin(x) and log(x)" )
plt .grid ()
plt .legend ()
plt .text (x = 70 , y = - 1 , s = "hahahaha" )
plt .annotate ("wow \n max" , xy = (16 , 1 ), xytext = (40 , 0.9 ), arrowprops = {"facecolor" : "orange" , "shrink" : 0.05 })
plt .annotate ("wow \n max again" , xy = (78 , 1 ), xytext = (95 , 0.9 ), arrowprops = {"facecolor" : "red" , "shrink" : 0.05 })
Multiple Figures and Axes
# Object-oriented style
fig1 , ax = plt .subplots ()
ax .plot (...)
fig2 , axs = plt .subplots (2 , 1 )
axs [0 ].plot (...)
axs [1 ].plot (...)
# Pyplot style
plt .figure (1 )
plt .title ("Figure 1" )
plt .figure (2 )
plt .subplot (311 )
plt .title ("Figure 2" )
plt .subplot (323 )
plt .subplot (324 )
plt .subplot (337 )
plt .subplot (338 )
plt .subplot (339 )
Line Plots and Filling Area
years = [1.1 , 1.3 , 1.5 , 2.0 , 2.2 , ...]
salary = [39343.00 , 46205.00 , 37731.00 , 43525.00 , 39891.00 , ...]
salary_mean = np .mean (salary )
# Line Plots
plt .plot (years ,
salary ,
marker = "o" ,
markersize = 5 ,
lw = 2 ,
ls = "-" ,
)
# Filling Areas
plt .fill_between (years ,
salary ,
salary_mean ,
where = (salary > salary_mean ),
alpha = .4 ,
color = "green" ,
edgecolor = "black" ,
interpolate = True ,
label = "On Average"
)
import matplotlib .dates as mdates
dates = np.arange(np.datetime64("2021-01-01"), np.datetime64("2021-01-22"))
prices = np.random.default_rng(42).normal(500, 30, len(dates))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%a, %d %m"))
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
plt.gca().xaxis.set_minor_locator(mdates.DayLocator())
# plt.plot handles datetime64 x-values directly; plot_date is deprecated.
plt.plot(dates, prices,
         ls="solid",
         c="orange",
         marker="^",
         markersize=10)
plt.grid()
plt.tight_layout()
# The trailing `...` entries are placeholders for the rest of the data.
temperature = [14.2 , 16.4 , 11.9 , 15.2 , ...]
ice_cream_sales = [215 , 325 , 185 , 332 , ...]
# Scale the sales by their vector norm to get the values mapped to colors.
colors = np .array (ice_cream_sales ) / np .linalg .norm (ice_cream_sales )
plt .scatter (temperature , ice_cream_sales ,
s = ice_cream_sales , # marker size taken from ice_cream_sales
c = colors , # marker color taken from the normalized ice_cream_sales
cmap = "Greens" , # colormap used to turn the values in `c` into colors
edgecolor = "black" , # the edge color of points
lw = 0.5 , # the edge width of points
alpha = .75 ,
)
plt .xlabel ("temperature" )
plt .ylabel ("ice cream price" )
plt .yscale ("log" ) # use log scale on y-axis to handle outliers
cbar = plt .colorbar ()
cbar .set_label ("Expensive" )
plt .tight_layout ()
# Bar Charts
ages = [25 , 26 , 27 , 28 , 29 , ...]
salary_all = [38496 , 42000 , 46752 , 49320 , 53200 , ...]
index = np .arange (len (ages ))
width = 0.25
plt .bar (index - width , salary_all , width = 0.25 , label = "All Devs" )
plt .bar (index , salary_py , width = 0.25 , label = "Python" )
plt .bar (index + width , salary_js , width = 0.25 , label = "JavaScript" )
plt .xticks (ticks = index , labels = ages )
plt .title ("Median Salary (USD) by Age" )
plt .xlabel ("Ages" )
plt .ylabel ("Median Salary (USD)" )
plt .legend ()
plt .tight_layout ()
# Horizontal Bar Charts
language = ['JavaScript' , 'HTML/CSS' , 'SQL' , 'Python' , ...]
popularity = [59219 , 55466 , 47544 , 36443 , ...]
plt .barh (language , popularity )
plt .title ("Most Popular Languages" )
plt .xlabel ("Number of People Who Use" )
plt .tight_layout ()
grade = ["A" , "B" , "C" , "D" , "E" ]
number = [10 , 18 , 23 , 8 , 5 ]
explode = [0.1 , 0 , 0 , 0 , 0 ]
plt .pie (number ,
labels = grade ,
shadow = True ,
autopct = "%1.1f%%" ,
pctdistance = 0.6 ,
startangle = 90 ,
explode = explode
)
plt .title ("Test Grade" )
plt .tight_layout ()
height_stats = np .random .default_rng (42 ).normal (160 , 15 , 1000 )
interval_bin = [120 , 130 , 140 , 150 , 160 , 170 , 180 , 190 , 200 ]
plt .hist (height_stats , bins = interval_bin ,
edgecolor = "black" , lw = 1 , density = True )
# Plot the probability density curve
import scipy.stats as ss
# gaussian_kde is exposed on scipy.stats itself; the scipy.stats.kde
# namespace is deprecated.
density = ss.gaussian_kde(height_stats)
index = np.arange(120, 200)
plt.plot(index,
         density.evaluate(index),
         color="pink",
         lw=3,
         ls="--",
         label="Probability Density")
# Plot the mean line
plt .axvline (np .mean (height_stats ), c = "orange" , lw = 5 , label = "Height Mean" )
plt .legend ()
plt .title ("Height Stats" )
plt .xlabel ("Heights" )
plt .ylabel ("Probability Density" )
plt .tight_layout ()
years = [1950 , 1960 , 1970 , 1980 , 1990 , 2000 , 2010 , 2018 ]
population_by_continent = {
'africa' : [228 , 284 , 365 , 477 , 631 , 814 , 1044 , 1275 ],
'americas' : [340 , 425 , 519 , 619 , 727 , 840 , 943 , 1006 ],
'asia' : [1394 , 1686 , 2120 , 2625 , 3202 , 3714 , 4169 , 4560 ],
'europe' : [220 , 253 , 276 , 295 , 310 , 303 , 294 , 293 ],
'oceania' : [12 , 15 , 19 , 22 , 26 , 31 , 36 , 39 ],
}
y = population_by_continent.values()
labels = population_by_continent.keys()
colors = ["#96ceb4", "#ffeead", "#ff6f69", "#ffcc5c", "#88d8b0"]
# The bundled seaborn styles were renamed to "seaborn-v0_8*" in
# Matplotlib 3.6; the old "seaborn" name no longer resolves in current releases.
plt.style.use("seaborn-v0_8")
plt.stackplot(years, y, labels=labels, colors=colors)
plt.legend(loc="upper left")
plt.title("World Population")
plt.xlabel("Year")
plt.ylabel("Population (Millions)")
plt.tight_layout()
import io
import urllib.request

url = "https://www.catster.com/wp-content/uploads/1970/01/Am-ShortHair-breed_getty1140883355-768x513.png"
# imread no longer accepts a URL, so download the bytes first and hand it a
# seekable in-memory file instead.
with urllib.request.urlopen(url) as response:
    img = mpimg.imread(io.BytesIO(response.read()), format="png")
plt.imshow(img)
# Applying pseudocolor schemes
plt .imshow (img [..., 0 ], cmap = "gray" )
plt .colorbar ()
# Flipping Photos Vertically or Horizontally
plt .imshow (img [::- 1 ]) # Reverse at the first axis == vertical flip
plt .imshow (img [:, ::- 1 ]) # Reverse at the second axis == horizontal flip
Styles, Colors, Colormaps
# Switch Style
# (the bundled seaborn styles carry a "seaborn-v0_8" prefix since Matplotlib 3.6)
plt.style.use("seaborn-v0_8-pastel")
# Data
x = np .random .default_rng (42 ).integers (0 , 100 , 100 )
y = (2 * x + 1 ) * np .random .default_rng (43 ).normal (5 , 1 , 100 )
regr = sklearn .linear_model .LinearRegression ()
regr .fit (x [:, np .newaxis ], y [:, np .newaxis ])
regr_line = regr .predict (x [:, np .newaxis ])
# Plotting with fancy color and colormap
plt .scatter (x , y , c = y , alpha = 0.25 , cmap = "plasma" )
plt .plot (x , regr_line ,
color = "darkviolet" ,
alpha = 0.5 ,
lw = 5 , ls = "-" ,
label = "regression line" )
plt .title ("Linear Regression Test" )
plt .xlabel ("X" )
plt .ylabel ("y" )
plt .legend ()
plt .colorbar ()
x = np .array (range (1 , 5 ))
y = x ** 2
df = pd .DataFrame (zip (x , y ), columns = ["col_1" , "col_2" ])
# Plotting with data parameter
def plot():
    """Draw a seaborn line plot of ``col_2`` against ``col_1`` from the module-level ``df``."""
    sns.lineplot(x="col_1", y="col_2", data=df)
# Seaborn Styles
sns .set_style ("white" )
# Scaling the plots
sns .set_context ("paper" , font_scale = 1.5 )
# Changing the figure Size
plt .figure (figsize = (8 , 4 )) # width, height
# Using Seaborn with Matplotlib
plt .subplot (211 )
plt .title ("Square X" )
plot ()
# Seaborn Styles Context Manager
# The style applies only to axes created inside the `with` block.
with sns.axes_style("darkgrid"):
    plt.subplot(212)
    plot()
plt.tight_layout()
# Sequential Palette
palette = sns .color_palette ("YlGn" )
sns .palplot (palette )
plt .title ("YlGn Colormap (Sequential)" )
# Diverging Palette
palette = sns .color_palette ("coolwarm" )
sns .palplot (palette )
plt .title ("coolwarm Colormap (Diverging)" )
# Qualitative Palette
palette = sns .color_palette ("Pastel2" )
sns .palplot (palette )
plt .title ("Pastel2 Colormap (Qualitative)" )
data = sns .load_dataset ("iris" )
plt .figure (figsize = (11 , 3 ))
plt .subplot (121 )
sns .lineplot (x = "sepal_length" , y = "sepal_width" , data = data )
plt .subplot (122 )
sns .lineplot (x = "petal_length" , y = "petal_width" , data = data )
grid = sns .FacetGrid (data , col = "species" )
grid .map (plt .plot , "sepal_width" )
x_vars = ["sepal_length" , "sepal_width" , "petal_length" , "petal_width" ]
y_vars = ["species" ]
grid = sns .PairGrid (data , x_vars = x_vars , y_vars = y_vars )
grid .map (sns .barplot )