Skip to content

Files

Latest commit

a193f10 · Sep 21, 2021

History

History
2326 lines (1634 loc) · 46.3 KB

README.md

File metadata and controls

2326 lines (1634 loc) · 46.3 KB

Table of contents

Must Know · Classes · Functions
Collections · Itertools · Functools
String · Int · Set · Tuple
Conditional · For-Loop · Try-Except · Design · IPython
Built-ins
Numpy · Pandas · Matplotlib (Pyplot)
Seaborn

Must Know

List & Dict & Set Comprehensions

[(i, j) for i in range(3) for j in range(3) if i > j]

# [(1, 0), (2, 0), (2, 1)]

Lambda Functions

li = [1, 2, 3]
li = [*map(lambda x: x * 10, li)]

#li = [10, 20, 30]

Map

num1 = [100, 1, 20]
num2 = [19, 4, 94]
num3 = [40, 6, 30]

[*map(lambda x, y, z: max(x, y, z), num1, num2, num3)]
# [100, 6, 94]

Filter

names = ['Liam', 'Olivia', 'Noah', 'Emma', 'Oliver', 'Ava']
choice = filter(lambda x: x.startswith('O'), names)

print(*choice, sep=', ') # Olivia, Oliver

Zip

a = [1, 2, 3]
b = [4, 5, 6]

c = [*zip(a, b)]  # [(1, 4), (2, 5), (3, 6)]
a, b = zip(*c)    # a=(1, 2, 3),  b=(4, 5, 6)

*args & **kwargs

Defining Functions with *arg and **kwarg

def example(a, *arg, b=0, **kwarg):
    """Print every kind of argument the function received, one per line.

    Order of output: the first positional argument, the tuple of extra
    positional arguments, the keyword-only ``b``, and the dict of extra
    keyword arguments.
    """
    # For example(1, 2, 3, b=1, x='a', y=[1, 2, 3]) this prints:
    #   1
    #   (2, 3)
    #   1
    #   {'x': 'a', 'y': [1, 2, 3]}
    for received in (a, arg, b, kwarg):
        print(received)

example(1, 2, 3, b=1, x='a', y=[1, 2, 3])

Calling Functions with *arg and **kwarg

def func(greet, time, name):
    print(greet, time, name)

func(*["Good", "Morning"], **{"name": "Jay"})
# Good Morning Jay

Unpack Variables

Unpacking Iterable

a, b, *_ = [1, 2, 3, 4, 5]
# 1, 2, [3, 4, 5]

Unpacking Generator

first, *amid, last = map(lambda x: x**2, range(1, 10000))
first  # 1
last   # 99980001

Unpacking in For-loop

sales = [("Pencil", 0.22, 1500), ("Notebook", 1.30, 550)]

for product, *_ in sales:
    print(product)
    # Pencil, Notebook

Unpacking Function

def compute(i):
    """Return the 5-tuple ``(i, i**2, i**3, i**4, i**5)``."""
    higher_powers = (i ** exponent for exponent in range(2, 6))
    return (i, *higher_powers)

num, power, cube, *_ = compute(3)
power  # 9
cube   # 27

Combining Dicts

number = {"one": 1, "two": 2}
letter = {"a": "A", "b": "B"}

combine = {**number, **letter}
combine  # {'one': 1, 'two': 2, 'a': 'A', 'b': 'B'}

Generator (map, filter, zip)

def square_it(value):
    """Lazily yield the squares of 0, 1, ..., ``value - 1``."""
    yield from (n ** 2 for n in range(value))

li = square_it(10_000_000)

[i for i in li if i < 50]  # [0, 1, 4, 9, 16, 25, 36, 49]

Closure & Decorator

def count_decorator(count):  # new decorator with argument
    """Build a decorator that calls the wrapped function ``count`` times.

    Each call of the decorated function prints the wrapped function's name
    and the arguments it was called with, then invokes the original
    function ``count`` times with those arguments.

    Returns:
        A decorator. The decorated function returns the result of the last
        invocation, or ``None`` when ``count`` is 0.
    """
    from functools import wraps

    def decorator(orig_func):
        @wraps(orig_func)  # keep __name__ / __doc__ of the wrapped function
        def wrapper(*args, **kwargs):
            print(f"func name: {orig_func.__name__}")
            print(f"func args: {args}, {kwargs}")

            result = None
            for _ in range(count):  # use the argument
                result = orig_func(*args, **kwargs)
            return result  # do not swallow the wrapped function's result

        return wrapper
    return decorator  # return the original decorator

@count_decorator(2)
def greet(msg):
    print(msg)


greet("hello")
# func name: greet
# func args: ('hello',), {}
# hello
# hello

Context Manager

import os
from contextlib import contextmanager


@contextmanager
def enterFolder(folderName):
    """Temporarily make ``folderName`` the current working directory.

    The previous working directory is restored when the ``with`` block
    exits, including when the block raises.
    """
    home = os.getcwd()
    os.chdir(folderName)
    try:
        yield
    finally:
        # Without try/finally an exception in the with-body would leave the
        # process stranded in ``folderName``.
        os.chdir(home)

with enterFolder('folder1'), open('example1.txt', 'w') as f:
    f.write('file1')

Magic Method

class BinaryInt(str):
    """A ``str`` whose text is the binary representation of an integer.

    ``BinaryInt(2)`` is the string ``"10"``. Adding an integer returns the
    binary text of the sum as a plain ``str``.
    """

    def __new__(cls, val):
        # "{val:b}" with no space: "{val: b}" applies the space sign flag
        # and would store " 10" instead of "10".
        return str.__new__(cls, f"{val:b}")

    def __add__(self, val):
        # Parse our own binary text back to an int before adding.
        total = int(self, 2) + val
        return f"{total:b}"


a = BinaryInt(2)
print(a)      # 10
print(a + 4)  # 110

Metaclasses

class Meta(type):
    """Metaclass enforcing that new classes define ``must_to_do``.

    A class named ``"Base"`` is exempt. Any other class whose own body does
    not contain ``must_to_do`` is rejected with ``TypeError`` at
    class-creation time.
    """

    def __new__(mtcls, name, bases, attrs):
        allowed = name == "Base" or "must_to_do" in attrs
        if not allowed:
            raise TypeError("Bad Class: must_to_do() is needed")
        return super().__new__(mtcls, name, bases, attrs)


class Base(metaclass=Meta):
    def server_func(self):
        return self.must_to_do()


class Derived(Base):
    ...
# TypeError: Bad Class: must_to_do() is needed

Threading & Multiprocessing

import concurrent.futures

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(load_url, url, 60) for url in URLS]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        print(len(result))


with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(load_url, URLS, [60] * len(URLS), chunksize=4)
    for result in results:
        print(len(result))

Classes

self (class instance)

class Person:
    """Tiny class used to show that ``p.say()`` equals ``Person.say(p)``."""

    def __init__(self, name):
        # Store the name on the instance so methods can reach it via self.
        self.name = name

    def say(self):
        """Return a short self-introduction containing the stored name."""
        introduction = f"I'm {self.name}"
        return introduction


p = Person("Jay")
p.say() == Person.say(p)  # True

variables (class & instance)

class Employee:
    """Shows a class-level counter next to a per-instance attribute."""

    num_emp = 0  # Class variable

    def __init__(self, pay):
        # Bump the counter stored on the class, so all instances see it.
        Employee.num_emp = Employee.num_emp + 1
        self.pay = pay  # Instance variable

e1 = Employee(100)
e2 = Employee(200)

e1.num_emp        # 2
Employee.num_emp  # 2

e1.pay  # 100
Employee.pay  # AttributeError: type object 'Employee' has no attribute 'pay'

method vs. classmethod vs. staticmethod

class Person:
    """Shows an instance initializer, a static helper, and a class-level constructor."""

    def __init__(self, name, age):
        self.name, self.age = name, age

    @staticmethod
    def splitPersonString(string, split_sign="-"):
        """Split ``string`` on ``split_sign`` and return the list of pieces."""
        pieces = string.split(split_sign)
        return pieces

    @classmethod
    def fromString(cls, cls_str):
        """Build an instance from text whose fields are separated by ``", "``.

        The pieces are passed to the constructor unchanged, i.e. as strings.
        """
        fields = cls.splitPersonString(cls_str, ", ")
        return cls(*fields)


p1 = Person.fromString("Jay, 99")
p1.name  # Jay
p1.age   # 99

_ (private) vs. __ (name mangling)

class Dog:
    """Contrasts a single-underscore name with a double-underscore (mangled) name."""

    _weight = 5  # "private" by convention only: still reachable as dog._weight

    def __bark(self):  # name-mangled method: reachable as _Dog__bark, not __bark
        print("bark")


dog = Dog()
dog._weight       # 5
dog.__bark()      # AttributeError: 'Dog' object has no attribute '__bark'
dog._Dog__bark()  # bark

@property (getter, setter)

class User:
    """User record with a computed full name and a write-only password.

    Assigning ``password`` stores only its MD5 hex digest in
    ``password_hash``; reading ``password`` raises ``AttributeError``.
    """

    def __init__(self, first_name, last_name, password):
        self.first_name = first_name
        self.last_name = last_name
        self.password = password  # goes through the property setter below

    @property
    def fullname(self):
        """Return the first and last name joined by one space."""
        return f"{self.first_name} {self.last_name}"

    @property
    def password(self):
        """Always raise: the plain-text password is never stored."""
        raise AttributeError("password is not readable.")

    @password.setter
    def password(self, password):
        from hashlib import md5

        # Hash the supplied value. The original hashed the literal bytes
        # b"{password}", so every user ended up with the same hash.
        # NOTE(review): MD5 is kept only to match the example; it is not a
        # suitable password hash for real systems.
        self.password_hash = md5(password.encode()).hexdigest()


user = User("Mimi", "Wang", "0000")
user.fullname       # Mimi Wang
user.password_hash  # 7fbccc9c3a9a5afef65563cd00404c1416
user.password       # Attribute Error: password is not readable.

LEGB (local, enclosing, global, builtins)

min([1, 2, 31])  # builtins min
min = "global min"

def outer():
    """Illustrate the enclosing and local scopes of the LEGB lookup order.

    Each assignment to ``min`` below creates a new name in its own scope;
    neither touches the global or builtin ``min``. Note that ``inner`` is
    only defined here, never called.
    """
    # we can do "global min" here to change global
    min = "enclosing min"
    
    def inner():
        # we can do "nonlocal min" here to change enclosing
        min = "local min"

Abstract class

from abc import ABC, abstractmethod


class Base(ABC, object):
    """Abstract base class declaring an abstract property and an abstract method."""

    @property
    @abstractmethod
    def foo(self):
        """Abstract read-only property; body is a placeholder."""
        ...

    @abstractmethod
    def do(self):
        """Abstract method; body is a placeholder."""
        ...

Dataclasses

from dataclasses import InitVar, dataclass, field
from typing import List


@dataclass
class InventoryItem:
    name: str
    unit_price: float = field(default=0.0)
    quantity_on_hand: int = field(default=0, repr=False)
    parts: List[str] = field(default_factory=list)
    parts_number: InitVar[int] = 0

    def __post_init__(self, parts_number):
        self.parts.extend([f"part{i}" for i in range(1, parts_number + 1)])


item = InventoryItem("product", parts_number=2)
# InventoryItem(name='product', unit_price=0.0, parts=['part1', 'part2'])

Classes in Dynamic Language

def getClass(x):
    """Return a freshly created ``Example`` class when ``x == 1``, else ``None``.

    The class is redefined on every loop iteration, so the one returned is
    the last definition, whose attribute ``a`` is 10.
    """
    if not x == 1:
        return None

    for i in range(11):
        # Each pass rebinds the local name ``Example`` to a brand-new class.
        class Example:
            a = i

    return Example


cls = getClass(1)
cls.b = "123"
print(cls.a, cls.b)  # 10 123

Functions

Enclosing function

def add_with_b(b):
    """Return a one-argument function that adds the captured ``b`` to its input."""

    def add(a):
        # ``b`` is looked up in the enclosing scope (closure).
        total = a + b
        return total

    return add

add4 = add_with_b(4)
add4(3)  # 7
add4(7)  # 11

Attrs

class Cat:
    """Bare class whose ``name`` and ``age`` are attached later via ``setattr``."""

    def __repr__(self):
        # Both attributes must exist by the time the instance is displayed.
        name, age = self.name, self.age
        return f"({name}: {age})"

listOfCats = []
attrs = [{"name": "meow1", "age": 5}, {"name": "meow2", "age": 10}]

for attr in attrs:
    cat = Cat()
    for key, val in attr.items():
        setattr(cat, key, val)
    listOfCats.append(cat)


print(listOfCats)
# [(meow1: 5), (meow2: 10)]

Functions in Dynamic Language

# Functions are ordinary objects created at run time: every iteration
# rebinds the name ``say`` to a new function object.
for i in range(100):
    def say():
        # ``i`` is looked up when say() is called, not when it is defined,
        # so calling say() after the loop prints 99.
        print(i)


def returnFunc(a):
    """Return a printer function chosen by the value of ``a``.

    When ``a < 100`` the returned function prints ``a * b``; otherwise it
    prints ``a + b``.
    """
    if not a < 100:
        def add(b):
            print(a + b)
        return add

    def mul(b):
        print(a * b)
    return mul

Collections

defaultdict

from collections import defaultdict

d = defaultdict(list)
d["a"] = [1, 2, 3]
d["b"].append(4)
d["c"].extend([5, 6])

# defaultdict(<class 'list'>, {'a': [1, 2, 3], 'b': [4], 'c': [5, 6]})

OrderedDict

from collections import OrderedDict

location = ["C", "B", "A"]
population = [32, 46, 12]

d = OrderedDict({l: p for l, p in zip(location, population)})
# OrderedDict([('C', 32), ('B', 46), ('A', 12)])

d["D"] = 44
# OrderedDict([('C', 32), ('B', 46), ('A', 12), ('D', 44)])

d.popitem(last=False)
# OrderedDict([('B', 46), ('A', 12), ('D', 44)])

d.move_to_end("D", last=False)
# OrderedDict ([( 'D', 44), ('B', 46), ('A', 12)])

Counter

from collections import Counter

c = Counter(cats=4, dogs=8)
# Counter({'dogs': 8, 'cats': 4})

c.update(birds=10)
# Counter({'birds': 10, 'dogs': 8, 'cats': 4})

c = c - Counter({"birds": 5})
# Counter({'dogs': 8, 'birds': 5, 'cats': 4})

c.most_common(2)
# [('dogs', 8), ('birds', 5)]

namedtuple

from collections import namedtuple

Dog = namedtuple("Dog", "name, age")
d1 = Dog("funny", 4)

features = ["happy", 3]
d2 = Dog._make(features)
# Dog(name='happy', age=3)

d2._asdict()
# {'name': 'happy', 'age': 3}  (a plain dict since Python 3.8; an OrderedDict on older versions)

deque

from collections import deque

li = [40, 30, 50, 46, 39, 44]
d = deque(li[:2])

# Let's compute the moving average with a window of 3
d.appendleft(0)
s = sum(d)

for elem in li[2:]:
    s += elem - d.popleft()
    d.append(elem)
    print(s / 3)
    # 40, 42, 45, 43

Itertools

Infinite iterators

count

from itertools import count 

gen = count(2.5, 0.5)

for x in gen:
    print(x)
    # 2.5, 3.0, 3.5, 4.0, ... non-stop

cycle

from itertools import cycle 

gen = cycle([1, 2, 3])

for x in gen:
    print(x)
    # 1, 2, 3, 1, 2, ... non-stop

repeat

from itertools import repeat 

class Cat:
    ...
    
gen = repeat(Cat(), 2)

for cat in gen:
    print(cat)
    # <__main__.Cat object at 0x0000019AC1C5D348>
    # <__main__.Cat object at 0x0000019AC1C5D348>

Iterators terminating on the shortest input sequence

accumulate

import operator
from itertools import accumulate

gen = accumulate([1, 2, 3, 4])
list(gen)  # [1, 3, 6, 10]


gen = accumulate([1, 2, 3, 4], func=operator.mul)
list(gen)  # [1, 2, 6, 24]

chain

from itertools import chain

gen = chain([1, 2], [3, 4])
list(gen)  # [1, 2, 3, 4]


gen = chain("AB", "CD")
list(gen)  # [A, B, C, D]

compress

from itertools import compress

gen = compress([1, 2, 3], [1, 0, 1])
gen = compress([1, 2, 3], [True, False, True])  # same

list(gen)  # [1, 3]

filterfalse

from itertools import filterfalse

gen = filterfalse(lambda x: x%2 == 0, [1, 2, 3])

list(gen)  # [1, 3]

groupby

from itertools import groupby

gen = groupby("AABBCCCAA")  # default func = lambda x: x
for k, g in gen:
    print(k, list(g))
    # A [A, A]
    # B [B, B]
    # C [C, C, C]
    # A [A, A]


gen = groupby([1, 2, 3, 4], lambda x: x // 3)
for k, g in gen:
    print(k, list(g))
    # 0 [1, 2]
    # 1 [3, 4]


gen = groupby([("A", 100), ("B", 200), ("C", 600)], lambda x: x[1] > 500)
for k, g in gen:
    print(k, list(g))
    # False [(A, 100), (B, 200)]
    # True  [(C, 600)]

islice

from itertools import islice

# islice(iterable, stop) / islice(iterable, start, stop[, step]) slices any
# iterable lazily, like sequence slicing but without negative indices.
gen = islice([1, 2, 3], 2)  # same as [1, 2, 3][:2]
list(gen)  # [1, 2]


gen = islice("ABCD", 2, 4)  # same as "ABCD"[2:4]
list(gen)  # ['C', 'D']


gen = islice("ABCD", 0, None, 2)  # same as "ABCD"[::2]
list(gen)  # ['A', 'C']

starmap

from itertools import starmap

# with only one argument
gen = starmap(lambda x: x.lower(), "ABCD")
list(gen)  # [a, b, c, d]


# with 2 arguments
gen = starmap(lambda x, y: x + y, [(1, 2), (3, 4)])
list(gen)  # [3, 7]


# with different size of arguments
gen = starmap(lambda *keys: sum(keys) / len(keys), [[3, 8, 3], [4, 2]])
list(gen)  # [4.6666667, 3.0]

takewhile

from itertools import takewhile

gen = takewhile(lambda x: x < 2, [1, 2, 3, 2, 1])
list(gen)  # [1]

gen = takewhile(lambda x: x.isupper(), "ABCdefgHIJ")
list(gen)  # [A, B, C]

dropwhile

from itertools import dropwhile

# dropwhile skips leading items while the predicate holds, then yields
# everything that remains (later items are not re-tested).
gen = dropwhile(lambda x: x < 2, [1, 2, 3, 2, 1])
list(gen)  # [2, 3, 2, 1]


gen = dropwhile(lambda x: x.isupper(), "ABCdefgHIJ")
list(gen)  # [d, e, f, g, H, I, J]

zip_longest

from itertools import zip_longest

gen = zip_longest("ABC", ("X", "Y"))
list(gen)  # [('A', 'X'), ('B', 'Y'), ('C', None)]


gen = zip_longest("ABC", [1, 2], fillvalue=-1)
list(gen)  # [('A', 1), ('B', 2), ('C', -1)]

Combinatoric iterators

product

from itertools import product

gen = product("AB", "CD")
list(gen)  # [AC, AD, BC, BD]


gen = product("AB", repeat=2)
list(gen)  # [AA, AB, BA, BB]


gen = product("AB", "CD", repeat=2)
list(gen)
# [ACAC, ACAD, ACBC, ACBD,
#  ADAC, ADAD, ADBC, ADBD,
#  BCAC, BCAD, BCBC, BCBD,
#  BDAC, BDAD, BDBC, BDBD]

permutations

from itertools import permutations

# Each result is a tuple, e.g. ('A', 'B', 'C'); the comments below write
# them joined together for brevity.
gen = permutations("ABC")  # same as r=3
list(gen)  # [ABC, ACB, BAC, BCA, CAB, CBA]


gen = permutations("ABC", r=2)
list(gen)  # [AB, AC, BA, BC, CA, CB]


gen = permutations("ABC", r=1)
list(gen)  # [A, B, C]

combinations

from itertools import combinations

# Each result is a tuple, e.g. ('A', 'B'); the comments below write them
# joined together for brevity. Order within a combination does not matter.
gen = combinations("ABC", 1)
list(gen)
# [A, B, C]


gen = combinations("ABC", 2)
list(gen)
# [AB, AC, BC]


gen = combinations("ABC", 3)
list(gen)
# [ABC]

combinations_with_replacement

from itertools import combinations_with_replacement

# Like combinations, but an element may be picked more than once. Each
# result is a tuple; the comments below write them joined for brevity.
gen = combinations_with_replacement("ABC", 1)
list(gen)
# [A, B, C]


gen = combinations_with_replacement("ABC", 2)
list(gen)
# [AA, AB, AC,
#  BB, BC,
#  CC]


gen = combinations_with_replacement("ABC", 3)
list(gen)
# [AAA, AAB, AAC, ABB, ABC, ACC,
#  BBB, BBC, BCC,
#  CCC]

Functools

Reduce

from functools import reduce

reduce(lambda x, y: x - y, [1, 2, 3, 4, 5], 100)  # 85

String

f-string

first_name = "Kain"
last_name = "Mccarthy"
print(f"Hi, I'm {first_name} {last_name}.")  # Hi, I'm Kain Mccarthy.


pi = 3.14159265359
print(f"{pi:.2f}")  # 3.14


d = {"name": "Shelly"}
print(f"She is {d['name']}")  # She is Shelly


i = 1000000
print(f"{i:,}")  # 1,000,000


# Ref:
#   * https://youtu.be/nghuHvKLhJA
#   * https://blog.louie.lu/2017/08/08/outdate-python-string-format-and-fstring/

Int

Underscore Placeholders

a = 100_000_000
b =  10_000_000
c =  1_0_0

print(f"{a+b+c:,}")  # 110,000,100


# Ref:
#   * https://youtu.be/C-gEQdGVXbk&t=140

Set

Search

long_list = [i for i in range(100_000_000)]
long_set = set(long_list)

%%time
100_000_000 in long_list
# False
# Wall time: 1.26 s


%%time
100_000_000 in long_set
# False
# Wall time: 0 ns


# Ref:
#   * https://stackoverflow.com/questions/2831212/python-sets-vs-lists/17945009
#   * https://youtu.be/r3R3h5ly_8g?t=1010

Tuple

Swap

a, b = 1, 2
a  # 1
b  # 2

a, b = b, a
a  # 2
b  # 1


# Ref:
#   * https://youtu.be/VBokjWj_cEA?list=LL&t=445

Condition

Ternary operator

if x < 1:
    x += 1
else:
    x -= 1

# equivalent to:

x = (x + 1) if (x < 1) else (x - 1)


# Ref:
#   * https://www.youtube.com/watch?v=C-gEQdGVXbk&t=34s

For-Loop

Enumerate

arr = ["a", "b", "c"]

for index, element in enumerate(arr):
    print(index, element)
    # 0 a
    # 1 b
    # 2 c

for index, element in enumerate(arr, start=3):
    print(index, element)
    # 3 a
    # 4 b
    # 5 c


# Ref
#   * https://youtu.be/VBokjWj_cEA?list=LL&t=190

For-Else

for text in "to be or not to be".split():
    if text.strip().startswith("o"):
        print(f"Found it! `{text}`")
        break
else:
    print("Not found")

# Found it! `or`


# Ref:
#   * https://www.youtube.com/watch?v=Dh-0lAyc3Bc

Try-Except

TEEF

try:
    print(1/1)
except Exception as e:
    print(e)
else:
    print("Safe")  # executed when except didn't happen
finally:
    print("Done")  # Always executed

# 1.0
# Safe
# Done


# Ref:
#   * https://youtu.be/VBokjWj_cEA?list=LL&t=1331

Design

Annotation

def func(a: str, b: int = 3) -> str:
    return a*b

func.__annotations__  # {'a': <class 'str'>, 'b': <class 'int'>, 'return': <class 'str'>}

func("hi")     # hihihi
func("hi", 5)  # hihihihihi
def func(a: "str longer than 5", b: 1+2 = 3) -> "str longer b times":
    return a*b

func.__annotations__  # {'a': 'str longer than 5', 'b': 3, 'return': 'str longer b times'}

func("hi")         # hihihi
func("ohayou", 2)  # ohayouohayou

Ref

Typing

from typing import Any, Dict, Iterable, List, Union


def func(a: List[int], b: Union[str, int], c: Dict[str, int], d: Iterable, e: Any):
    print(len(a))
    
    print(f"{b} can be str or int.")
    
    print(f"{c['something']} will return int.")
    
    for i in d:
        print(i)
        
    print(f"{type(e)} can be any type.")


# Ref:
#   * https://myapollo.com.tw/zh-tw/python-typing-module/

Pass and Ellipsis

# Style 1
def my_abstract_method(self):
    pass

# Style 2
def my_abstract_method(self):
    ...

# Style 3
def my_abstract_method(self):
    """
    This function is ...
    """

# Ref:
#   * https://stackoverflow.com/questions/55274977/when-is-the-usage-of-the-python-ellipsis-to-be-preferred-over-pass
#   * https://stackoverflow.com/questions/772124/what-does-the-ellipsis-object-do

IPython

VSCode Python Interactive window

#%%
1+1
# 2


# Ref:
#   * https://code.visualstudio.com/docs/python/jupyter-support-py

Time Measure

One Line

%time sleep(0.3)  
# Wall time: 310 ms

%timeit sleep(0.3)
# 311 ms ± 2.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Multiple Lines

%%time
for i in range(10):
    sleep(0.1)
# Wall time: 1.09 s


%%timeit
for i in range(10):
    sleep(0.1)
# 1.09 s ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Ref

Memory Measure

Installation

!pip install -U memory_profiler

%load_ext memory_profiler

One Line

%memit [i for i in range(1000)]

# peak memory: 51.31 MiB, increment: 0.36 MiB

Multiple Lines

%%memit
l = []
for x in range(10000):
    l.append(x*2)

# peak memory: 52.76 MiB, increment: 0.70 MiB

Ref

Modules

pathlib

from pathlib import Path

# Create the nested directory (and any missing parents); no error if it exists.
sub_folder = Path("subfolder/subfolder")
sub_folder.mkdir(parents=True, exist_ok=True)

# The "/" operator joins path components.
file_ = sub_folder / Path("test.txt")
file_.touch()

file_.write_text("Hello")
file_.read_text()

# Clean up: delete the file, then the (now empty) inner directory.
# The outer "subfolder" directory is left in place.
file_.unlink()
Path("subfolder/subfolder").rmdir()

Numpy

Create Array or Matrix

np.array([[1, 2], [3, 4], [5, 6]])  # create from list

np.zeros((3, 3))  # create filled with 0's

np.ones((2, 4, 4))  # create filled with 1's

np.empty((5, 2))  # create with speed

np.arange(2, 10, 3)  # create array from range (start, end, step_size)

np.linspace(5, 50, 20)  # create a linear space (start, end, num_elements)

# create from random generator
rng = np.random.default_rng(seed=42)

rng.random((2, 4))

rng.normal(3, 2.5, size=(2, 4))  # sample from N(3, 6.25)

rng.integers(low=2, high=10, size=(10, 2))  # random integer matrix

Basic Operations

Sort and Concatenate

np.sort(a, axis=None)
np.sort(a, axis=-1)[::-1]
a.sort()
a[::-1].sort()

np.concatenate((a, b), axis=None)
np.concatenate((a, b), axis=2)

Element-wise

a = np.arange(5)           # [0, 1, 2, 3, 4]
b = np.ones(5, dtype=int)  # [1, 1, 1, 1, 1]

a + b       # [1 2 3 4 5]
a - b       # [-1  0  1  2  3]
a ** 2      # [ 0  1  4  9 16]  (power is **; ^ is bitwise XOR: a ^ 2 -> [2 3 0 1 6])
a * 10      # [ 0 10 20 30 40]
a > 2       # [False False False  True  True]
np.sqrt(a)  # [0. , 1.  , 1.41421356, 1.73205081, 2. ]
a*b  # [0 1 2 3 4]   element-wise product
a@b  # 10            dot product

All (None) Column-wise (0), Row-wise (1)

A = np.random.default_rng(42).random((2, 4))
# [[0.77395605, 0.43887844, 0.85859792, 0.69736803],
#  [0.09417735, 0.97562235, 0.7611397 , 0.78606431]])

A.max()        # 0.97562235
A.max(axis=0)  # [0.77395605, 0.97562235, 0.85859792, 0.78606431]
A.max(axis=1)  # [0.85859792, 0.97562235]

A.mean()        # 0.6732255180088094
A.mean(axis=0)  # [0.4340667 , 0.7072504 , 0.80986881, 0.74171617]
A.mean(axis=1)  # [0.69220011, 0.65425093]

Indexing and Slicing

# Index and slicing arrays
x[1, 3] == x[1][3]
y[1:5:2, ::3]


# Indexing arrays
x[np.array([0, 1, 2, -1, -2])]
y[np.array([1, 2, 3]), 1:4:2]
y[np.array([1, 2]), np.array([-1, -1])]


# Masking arrays
x[x>5]
x[(x%2==0) | (x>7)]
y[[True]*3 + [False] + [True] + [False], 2::2]


# Ellipsis syntax
x[-1, ..., 3]  # same as x[-1, :, 3]
x[:3, ...]  # same as x[0:3, :, :] and x[0:3] and x[:3]
x[::2, ..., np.array([0, 2])]  # same as x[0:5:2, :, np.array([0, 2])]

Shape Manipulation

A = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 6, 8], [2, 1, 6]]])

A.shape  # (2, 2, 3)

A = A.reshape(3, 2, 2)  # (3, 2, 2)

A = A[np.newaxis, ...]  # (1, 3, 2, 2)

A = np.expand_dims(A, axis=4)  # (1, 3, 2, 2, 1)

A = A.flatten()  # (12,)

A = A.reshape(2, -1, 2)  # (2, 3, 2)

Copying

# shallow copy: values will change on every variable
a = np.arange(10).reshape(5, 2)
b = a.view()
c = a.reshape(-1)
d = a[:3, :1]


# deep copy: copy and create an entirely new array
a = np.arange(10000000)
b = a[:100].copy()
del a

Broadcasting

# scalar broadcasting
a = np.array([1, 2, 3])
a * 3  # [3, 6, 9]


# general broadcasting
a =  np.ones( (8, 1, 6, 1))
b = np.zeros(    (7, 1, 5))
(a*b).shape  # 8, 7, 6, 5


# outer sum via broadcasting: (4, 1) + (3,) -> (4, 3)
a = np.arange(4)[:, np.newaxis]  # (4, 1)
b = np.array([1, 2, 3])  # (3,)

a + b  # (4, 3)

# [0]  + [1, 2, 3] =  [1 2 3]
# [1]                 [2 3 4]
# [2]                 [3 4 5]
# [3]                 [4 5 6]

Pandas

Creation and Viewing

# Create Series
pd.Series([1, 2, 3, 4, 5])
pd.Series(np.arange(1, 6), index=list("abcde"))
pd.Series({"a": 100, "b": 50, "c": 120})
pd.Series("hi", index=list("12345"))


# Create DataFrame
pd.DataFrame({
    "col_1": [1, 2, 3, 4, 5],
    "col_2": np.arange(1, 6),
    "col_3": pd.Series(np.arange(1, 7), index=list("abc123")),
}, index=list("abcde"))

pd.DataFrame(
    [
        {"a": 1, "b": 2},
        {"b": 10, "c": 5},
        {"a": 55, "b": 489, "c": 32, "d": 590},
    ],
    index=["first", "second", "third"],
    columns=list("ab")
)

pd.DataFrame(
    np.arange(10).reshape(2, 5),
    # [[0,1,2,3,4], [5,6,7,8,9]]
    index=pd.date_range("20200101", periods=2),
    columns=list("abcde"))


# Viewing
df.head(2)
df.tail(3)
df.index
df.columns
df.to_numpy()
df.sort_index()
df.sort_values("col_name")

Selection

| | Single Column | Multiple Columns | Continuous Columns | All Columns |
|---|---|---|---|---|
| Single Row | df.loc[row, column] or df.at[row, column] | df.loc[row, [column, column]] | df.loc[row, column:column] | df.loc[row] |
| Multiple Rows | df.loc[[row, row], column] | df.loc[[row, row], [column, column]] | df.loc[[row, row], column:column] | df.loc[[row, row]] |
| Continuous Rows | df.loc[row:row, column] | df.loc[row:row, [column, column]] | df.loc[row:row, column:column] | df[row:row] |
| All Rows | df[column] | df[[column, column]] or df.loc[:, [column, column]] | df.loc[:, column:column] | df |

df["col1"]
df[["col1", "col2"]]
df["row1":"row5"]

# The iloc/iat equivalents below assume labels row1..row5 and col1..col5
# sit at positions 0..4 — TODO confirm against the actual frame.
df.loc["row1", "col1"]  # df.iloc[0, 0]
df.at["row1", "col1"]  # df.iat[0, 0]

df.loc["row1", ["col1", "col2"]]  # df.iloc[0, [0, 1]]
df.loc["row1", "col1":"col5"]  # df.iloc[0, 0:5]  (label slices include the end label)

df.loc[["row1", "row2"]]  # df.iloc[[0, 1]]
df.loc["row1":"row5", "col1"]  # df.iloc[0:5, 0]  (label slices include the end label)


df[(df["col1"] > 18)]
df[(df > 6) & (df < 25)]
df[df["col1"].isin([10, 15, 0])]
  • df.iloc is same as df.loc but using position.
  • df.iat is same as df.at but using position.
  • Details 🔥

Setting, Deleting, and Handling

# Modify columns
df["col1"] += 10
df.loc[:, "col1"] = "bar"
df.loc[:, ["col1", "col3"]] = np.arange(12).reshape(6, 2)

# Modify single element
df.loc["row1", "col1"] = 0
df.iloc[0, 0] = 1

# Modify by boolean indexing
df[df < 100] = -df

# Append
df["total"] = df.sum(axis=1).to_numpy()
df["gt"] = df["total"] > 50000
df["foo"] = "bar"

# Insert
df.insert(0, "col0", df["col2"][:2])  # col_index, col_name, values

# Delete column (three ways)
del df["total"]
df.drop(columns=["foo"], inplace=True)  # same as `df.drop(["foo"], axis=1)`
# pop removes the column and returns it; the column was created as "gt",
# so popping "gt50000" would raise KeyError.
gt50000 = df.pop("gt")

# Delete row
df.drop(["e", "d"], inplace=True)

# Handle NaN
miss_df.dropna(how='any')
miss_df.fillna(value=10000000)

Operations and Apply Functions

# Arithmetic
df + df2
df - df.iloc[0]
1 / df


# Numpy
np.sqrt(df)
np.max(df, axis=1)


# Built-in
df.mean()
df.max(axis=1)


# Apply
df.apply(np.cumsum, axis=1)
df.apply(lambda x: x.sum() / x.size)  # x means df


# Series
s.value_counts()
s.str.upper()
s.str.split("-").str.get(0)

Concat and Merge

# Concat rows
pd.concat([df[:3], df.iloc[7:, :2]])

# Merge two DataFrame
pd.merge(df, df2, on="name", how="right")

Grouping and Categorical Data Type

# Groupby
df.groupby("col_A").sum()
df.groupby(["col_A", "col_B"]).max()

# Categorical - discrete
df["grade"] = df["grade"].astype("category")
# Relabel the categories. Assigning to `.cat.categories` is no longer
# supported in pandas 2.x; rename_categories returns the relabelled Series.
df["grade"] = df["grade"].cat.rename_categories(["Bad", "Good", "Excellent"])
df.sort_values(by="grade")
df.groupby("grade").size()

# Categorical - continuous
df["grade-labels"] = pd.cut(df["score"], bins=range(0, 120, 20), labels=list("EDCBA"))

Other Pandas Tricks

# Rename Columns
df.columns = ["col_one", "col_two"]
df = df.add_prefix("Xx_")
df = df.add_suffix("_xX")
df.columns = df.columns.str.replace("Xx", "Oo")
df.columns = df.columns.str.replace("xX", "oO")


# Reverse Row or Column Order
df.loc[::-1].reset_index(drop=True) # reverse rows
df.loc[:, ::-1] # reverse columns


# Split DataFrame into 2 random subsets
sub1 = df.sample(frac=0.75, random_state=42)
sub2 = df.drop(sub1.index)
# Put each subset back in index order. sort_index() moves whole rows;
# assigning `sub.index = sub.index.sort_values()` would only relabel the
# shuffled rows and detach them from their original index values.
sub1 = sub1.sort_index()
sub2 = sub2.sort_index()


# Filter by Category (or Largest Category)
df[df.genre.isin(["A", "D"])]
df[~df.genre.isin(["A", "D"])]
df[df.genre.isin(df.genre.value_counts().nlargest(1).index)]


# Split String into Multiple Columns
df[["first", "last"]] = df["name"].str.split(' ', expand=True)
df["city"] = df["location"].str.split(", ", expand=True)[0]


# Change Display Options (Not Change Data)
pd.set_option("display.float_format", "${:.2f}".format)
pd.reset_option("display.float_format")


# Style a DataFrame
style = {"Date": "{:%Y/%m/%d}", "Value": "${:d}", "Volume": "{:,}"}
df.style.format(style) \
    .hide_index() \
    .highlight_max("Value", color="red") \
    .highlight_min("Value", color="green") \
    .bar("Area", color="orange", align="zero") \
    .background_gradient(subset="Volume", cmap="Greens") \
    .set_caption("Random Chart")

Matplotlib (Pyplot)

Basic (Single Plot)

import matplotlib.pyplot as plt

# with this magic function, we can skip `plt.show()`
%matplotlib inline

plt.plot(np.sin(np.linspace(0, 10, 100)), "*-b", lw=2, markersize=5, label="sin(x)")
plt.plot(np.log(np.arange(100)), c="g", ls="--", marker=".", lw=2, markersize=5, label="log(x)")

plt.xlabel("X here")
plt.ylabel("Y here")

plt.title("sin(x) and log(x)")

plt.grid()

plt.legend()

plt.text(x=70, y=-1, s="hahahaha")

plt.annotate("wow \nmax", xy=(16, 1), xytext=(40, 0.9), arrowprops={"facecolor": "orange", "shrink": 0.05})
plt.annotate("wow \nmax again", xy=(78, 1), xytext=(95, 0.9), arrowprops={"facecolor": "red", "shrink": 0.05})

Multiple Figures and Axes

# Object-oriented style
fig1, ax = plt.subplots()
ax.plot(...)

fig2, axs = plt.subplots(2, 1)
axs[0].plot(...)
axs[1].plot(...)


# Pyplot style
plt.figure(1)
plt.title("Figure 1")

plt.figure(2)
plt.subplot(311)
plt.title("Figure 2")

plt.subplot(323)
plt.subplot(324)

plt.subplot(337)
plt.subplot(338)
plt.subplot(339)

Line Plots and Filling Area

years = [1.1, 1.3, 1.5, 2.0, 2.2, ...]
salary = [39343.00, 46205.00, 37731.00, 43525.00, 39891.00, ...]
salary_mean = np.mean(salary)

# Line Plots
plt.plot(years, 
         salary,
         marker="o",
         markersize=5,
         lw=2,
         ls="-",
         )


# Filling Areas
plt.fill_between(years, 
                 salary,
                 salary_mean,
                 where=(salary > salary_mean),
                 alpha=.4,
                 color="green", 
                 edgecolor="black",
                 interpolate=True,
                 label="On Average"
                 )

Time Series

import matplotlib.dates as mdates

dates = np.arange(np.datetime64("2021-01-01"), np.datetime64("2021-01-22"))
prices = np.random.default_rng(42).normal(500, 30, len(dates))

plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%a, %d %m"))
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
plt.gca().xaxis.set_minor_locator(mdates.DayLocator())

plt.plot_date(dates, prices, 
              ls="solid",
              c="orange",
              marker="^",
              markersize=10)

plt.grid()
plt.tight_layout()

Scatter Plots

temperature = [14.2, 16.4, 11.9, 15.2, ...]
ice_cream_sales = [215, 325, 185, 332, ...]
colors = np.array(ice_cream_sales) / np.linalg.norm(ice_cream_sales)

plt.scatter(temperature, ice_cream_sales, 
            s=ice_cream_sales,  # set the size according to the prices of the ice cream
            c=colors,  # set the colors according to the prices of the ice cream
            cmap="Greens",  # preferred color type
            edgecolor="black",  # the edge color of points
            lw=0.5,  # the edge width of points
            alpha=.75,
            )

plt.xlabel("temperature")
plt.ylabel("ice cream price")
plt.yscale("log")   # use log scale on y-axis to handle outliers

cbar = plt.colorbar()
cbar.set_label("Expensive")

plt.tight_layout()

Bar Charts

# Bar Charts
ages = [25, 26, 27, 28, 29, ...]
salary_all = [38496, 42000, 46752, 49320, 53200, ...]

index = np.arange(len(ages))
width = 0.25

plt.bar(index - width, salary_all, width=0.25, label="All Devs")
plt.bar(index, salary_py, width=0.25, label="Python")
plt.bar(index + width, salary_js, width=0.25, label="JavaScript")
plt.xticks(ticks=index, labels=ages)

plt.title("Median Salary (USD) by Age")
plt.xlabel("Ages")
plt.ylabel("Median Salary (USD)")
plt.legend()
plt.tight_layout()


# Horizontal Bar Charts
language = ['JavaScript', 'HTML/CSS', 'SQL', 'Python', ...]
popularity = [59219, 55466, 47544, 36443, ...]

plt.barh(language, popularity)
plt.title("Most Popular Languages")
plt.xlabel("Number of People Who Use")
plt.tight_layout()

Pie Charts

grade = ["A", "B", "C", "D", "E"]
number = [10, 18, 23, 8, 5]
explode = [0.1, 0, 0, 0, 0]

plt.pie(number, 
        labels=grade,
        shadow=True,
        autopct="%1.1f%%",
        pctdistance=0.6,
        startangle=90,
        explode=explode
        )

plt.title("Test Grade")
plt.tight_layout()

Histograms

height_stats = np.random.default_rng(42).normal(160, 15, 1000)

interval_bin = [120, 130, 140, 150, 160, 170, 180, 190, 200]
plt.hist(height_stats, bins=interval_bin,
         edgecolor="black", lw=1, density=True)

# Plot the probability density curve
import scipy.stats as ss
# gaussian_kde is part of the public scipy.stats namespace; going through
# the `scipy.stats.kde` submodule is deprecated.
density = ss.gaussian_kde(height_stats)
index = np.arange(120, 200)
plt.plot(index, 
         density.evaluate(index), 
         color="pink",
         lw=3,
         ls="--",
         label="Probability Density")

# Plot the mean line
plt.axvline(np.mean(height_stats), c="orange", lw=5, label="Height Mean")
plt.legend()

plt.title("Height Stats")
plt.xlabel("Heights")
plt.ylabel("Probability Density")
plt.tight_layout()

Stack Plots

years = [1950, 1960, 1970, 1980, 1990, 2000, 2010, 2018]

population_by_continent = {
    'africa': [228, 284, 365, 477, 631, 814, 1044, 1275],
    'americas': [340, 425, 519, 619, 727, 840, 943, 1006],
    'asia': [1394, 1686, 2120, 2625, 3202, 3714, 4169, 4560],
    'europe': [220, 253, 276, 295, 310, 303, 294, 293],
    'oceania': [12, 15, 19, 22, 26, 31, 36, 39],
}

y = population_by_continent.values()
labels = population_by_continent.keys()
colors = ["#96ceb4", "#ffeead", "#ff6f69", "#ffcc5c", "#88d8b0"]

plt.style.use("seaborn")
plt.stackplot(years, y, labels=labels, colors=colors)

plt.legend(loc="upper left")
plt.title("World Population")
plt.xlabel("Year")
plt.ylabel("Population (Millions)")
plt.tight_layout()

Image

img = mpimg.imread("https://www.catster.com/wp-content/uploads/1970/01/Am-ShortHair-breed_getty1140883355-768x513.png")
plt.imshow(img)

# Applying pseudocolor schemes
plt.imshow(img[..., 0], cmap="gray")
plt.colorbar()

# Flipping Photos Vertically or Horizontally
plt.imshow(img[::-1])  # Reverse at the first axis == vertical flip
plt.imshow(img[:, ::-1])  # Reverse at the second axis == horizontal flip

Styles, Colors, Colormaps

# Switch Style
plt.style.use("seaborn-pastel")

# Data
x = np.random.default_rng(42).integers(0, 100, 100)
y = (2*x+1) * np.random.default_rng(43).normal(5, 1, 100)

regr = sklearn.linear_model.LinearRegression()
regr.fit(x[:, np.newaxis], y[:, np.newaxis])
regr_line = regr.predict(x[:, np.newaxis])

# Plotting with fancy color and colormap
plt.scatter(x, y, c=y, alpha=0.25, cmap="plasma")

plt.plot(x, regr_line, 
         color="darkviolet", 
         alpha=0.5,
         lw=5, ls="-",
         label="regression line")


plt.title("Linear Regression Test")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.colorbar()

Seaborn

Basic (Seaborn)

x = np.array(range(1, 5))
y = x**2
df = pd.DataFrame(zip(x, y), columns=["col_1", "col_2"])

# Plotting with data parameter
def plot():
    sns.lineplot(x="col_1", y="col_2", data=df)

# Seaborn Styles
sns.set_style("white")

# Scaling the plots
sns.set_context("paper", font_scale=1.5)

# Changing the figure Size
plt.figure(figsize=(8, 4))  # width, height 

# Using Seaborn with Matplotlib
plt.subplot(211)
plt.title("Square X")
plot()

# Seaborn Styles Context Manager
with sns.axes_style("darkgrid"):
    plt.subplot(212)
    plot()

plt.tight_layout()

Color Palette

# Sequential Palette
palette = sns.color_palette("YlGn")
sns.palplot(palette)
plt.title("YlGn Colormap (Sequential)")

# Diverging Palette
palette = sns.color_palette("coolwarm")
sns.palplot(palette)
plt.title("coolwarm Colormap (Diverging)")

# Qualitative Palette
palette = sns.color_palette("Pastel2")
sns.palplot(palette)
plt.title("Pastel2 Colormap (Qualitative)")



Multiple Plots

Using Matplotlib

data = sns.load_dataset("iris")

plt.figure(figsize=(11, 3))
plt.subplot(121)
sns.lineplot(x="sepal_length", y="sepal_width", data=data)

plt.subplot(122)
sns.lineplot(x="petal_length", y="petal_width", data=data)

Using Seaborn

FacetGrid

grid = sns.FacetGrid(data, col="species")
grid.map(plt.plot, "sepal_width")

PairGrid

x_vars = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
y_vars = ["species"]

grid = sns.PairGrid(data, x_vars=x_vars, y_vars=y_vars)
grid.map(sns.barplot)