import pandas as pd
import numpy as np
DataFrame
是一种二维标记数据结构,具有可能不同类型的列。您可以将其视为电子表格或 SQL 表,或 Series
对象的字典。它通常是最常用的 pandas 对象。
除了数据,您还可以选择传递索引(行标签)和列(列标签)参数。如果您传递了索引和/或列,则结果 DataFrame
的索引和/或列将以您传入的为准。因此,Series
的字典加上特定的索引将丢弃所有与传递的索引不匹配的数据。
创建 DataFrame 对象
与 Series
一样,DataFrame
接受许多不同类型的输入:
以 Series 或 dict 为值的 dict
# Two Series whose labels only partly overlap: "two" carries the extra label "d".
data = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
)
结果索引将是各个 Series 的索引的并集。如果有任何嵌套的 dict,这些将首先转换为 Series。如果未传递任何列,则这些列将是 dict 键的有序列表。
pd.DataFrame(data)
|
one |
two |
a |
1.0 |
1.0 |
b |
2.0 |
2.0 |
c |
3.0 |
3.0 |
d |
NaN |
4.0 |
pd.DataFrame(data, index=['d', 'b', 'a'])
|
one |
two |
d |
NaN |
4.0 |
b |
2.0 |
2.0 |
a |
1.0 |
1.0 |
pd.DataFrame(data, index=['d', 'b', 'a'], columns=['two', 'three'])
|
two |
three |
d |
4.0 |
NaN |
b |
2.0 |
NaN |
a |
1.0 |
NaN |
以 list 或 ndarray 为值的 dict
ndarrays
必须都是相同的长度。如果传递了索引,它显然也必须与数组的长度相同。如果没有传递索引,则结果索引将是 range(n)
,其中 n
是数组长度。
# Equal-length lists become the columns of the frame.
data = {
    "one": [1.0, 2.0, 3.0, 4.0],
    "two": [4.0, 3.0, 2.0, 1.0],
}
# Display the frame built from the dict; no index is passed here, so the
# rows get the default 0..n-1 labels seen in the table that follows.
pd.DataFrame(data)
|
one |
two |
0 |
1.0 |
4.0 |
1 |
2.0 |
3.0 |
2 |
3.0 |
2.0 |
3 |
4.0 |
1.0 |
pd.DataFrame(data, index=['a', 'b', 'c', 'd'])
|
one |
two |
a |
1.0 |
4.0 |
b |
2.0 |
3.0 |
c |
3.0 |
2.0 |
d |
4.0 |
1.0 |
结构化或记录数组
# Zero-filled structured array with three named fields (A, B, C).
# "S10" (fixed-width 10-byte string) is the canonical dtype code; the old
# "a10" spelling is a deprecated alias for the same type.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data
array([(0, 0., b''), (0, 0., b'')],
dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])
# Fill both records in one assignment; the str values end up stored as
# bytes in field "C" (see the echoed array below).
data[:] = [(1, 2.0, 'Hello'), (2, 3.0, 'World')]
data
array([(1, 2., b'Hello'), (2, 3., b'World')],
dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])
|
A |
B |
C |
0 |
1 |
2.0 |
b'Hello' |
1 |
2 |
3.0 |
b'World' |
pd.DataFrame(data, index=['first', 'second'])
|
A |
B |
C |
first |
1 |
2.0 |
b'Hello' |
second |
2 |
3.0 |
b'World' |
pd.DataFrame(data, columns=['C', 'A', 'B'])
|
C |
A |
B |
0 |
b'Hello' |
1 |
2.0 |
1 |
b'World' |
2 |
3.0 |
由 dict 组成的 list
# Each dict is one row; a key missing from a row (here "c" in the first
# dict) shows up as NaN in that row.
data = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20},
]
# Display the frame built from the list; this produces the table that follows.
pd.DataFrame(data)
|
a |
b |
c |
0 |
1 |
2 |
NaN |
1 |
5 |
10 |
20.0 |
pd.DataFrame(data, index=['first', 'second'])
|
a |
b |
c |
first |
1 |
2 |
NaN |
second |
5 |
10 |
20.0 |
pd.DataFrame(data, columns=['a', 'b'])
以 tuple 为键的 dict
这样创建出的 DataFrame
具有多重索引(MultiIndex)。
# Build a frame from a dict keyed by tuples.
# The tuple keys of the outer dict become two-level column labels, and the
# tuple keys of the inner dicts become two-level row labels (see the table
# below). A (row, column) pair that no inner dict provides shows up as NaN.
df = pd.DataFrame(
{
('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10},
}
)
df
|
|
a |
b |
|
|
b |
a |
c |
a |
b |
A |
B |
1.0 |
4.0 |
5.0 |
8.0 |
10.0 |
C |
2.0 |
3.0 |
6.0 |
7.0 |
NaN |
D |
NaN |
NaN |
NaN |
NaN |
9.0 |
用 'a' 和 'b' 索引 df(即 df['a'] 和 df['b'])
返回的还是 DataFrame 对象,它们是嵌套在 df
里面的。
|
|
b |
a |
c |
A |
B |
1.0 |
4.0 |
5.0 |
C |
2.0 |
3.0 |
6.0 |
D |
NaN |
NaN |
NaN |
|
|
a |
b |
A |
B |
8.0 |
10.0 |
C |
7.0 |
NaN |
D |
NaN |
9.0 |
namedtuple 序列
列表中第一个 namedtuple
的字段名称决定了 DataFrame
的列。其余的命名元组(或元组)被简单地解包,它们的值被输入到 DataFrame
的行中。
from collections import namedtuple

# The namedtuple's field names (x, y) become the DataFrame's column labels.
Point = namedtuple('Point', ['x', 'y'])
pd.DataFrame([Point(x, y) for x, y in ((0, 0), (0, 3), (2, 3))])
如果这些元组中的任何一个比第一个 namedtuple
短,则相应行中后面的列被标记为缺失值。如果有任何比第一个命名元组长,则会引发 ValueError。
# The last row is a two-field Point, so its "z" cell is filled with NaN.
Point3D = namedtuple('Point3D', ['x', 'y', 'z'])
pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])
|
x |
y |
z |
0 |
0 |
0 |
0.0 |
1 |
0 |
3 |
5.0 |
2 |
2 |
3 |
NaN |
dataclass 序列
from dataclasses import make_dataclass

# Dataclass with two int-annotated fields; a list of instances is passed
# straight to the DataFrame constructor.
Point = make_dataclass('Point', [(axis, int) for axis in ('x', 'y')])
pd.DataFrame([Point(x, y) for x, y in ((0, 0), (0, 3), (2, 3))])
备用构造函数
# Default orientation: the dict keys become the column labels.
pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]})
如果参数 orient='index'
,键将是行标签。在这种情况下,您还可以传递所需的列名:
# orient="index" makes the dict keys the row labels; the column labels are
# then supplied explicitly through `columns`.
pd.DataFrame.from_dict(
    {"A": [1, 2, 3], "B": [4, 5, 6]},
    orient="index",
    columns=["one", "two", "three"],
)
# Zero-filled structured array with three named fields (A, B, C).
# "S10" (fixed-width 10-byte string) is the canonical dtype code; the old
# "a10" spelling is a deprecated alias for the same type.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data
array([(0, 0., b''), (0, 0., b'')],
dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])
pd.DataFrame.from_records(data, index='C')
|
A |
B |
C |
|
|
b'' |
0 |
0.0 |
b'' |
0 |
0.0 |
DataFrame 对象的属性
# Sample frame for the attribute examples: row "d" exists only in "two",
# so column "one" is NaN there.
data = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
)
df = pd.DataFrame(data=data)
df
|
one |
two |
a |
1.0 |
1.0 |
b |
2.0 |
2.0 |
c |
3.0 |
3.0 |
d |
NaN |
4.0 |
df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
df.columns
Index(['one', 'two'], dtype='object')
类似 NumPy ndarray,可以对 DataFrame 对象进行转置:
df.T
|
a |
b |
c |
d |
one |
1.0 |
2.0 |
3.0 |
NaN |
two |
1.0 |
2.0 |
3.0 |
4.0 |
注意:DataFrame
并非完全像二维 NumPy ndarray
那样工作。
DataFrame 列操作
# Sample frame for the column-operation examples: row "d" exists only in
# "two", so column "one" is NaN there.
data = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
)
df = pd.DataFrame(data=data)
df
|
one |
two |
a |
1.0 |
1.0 |
b |
2.0 |
2.0 |
c |
3.0 |
3.0 |
d |
NaN |
4.0 |
可以在语义上认为 DataFrame
对象是索引 Series
对象的字典。因此,对 DataFrame
对象进行列操作和对 dict
对象进行增删改查等操作类似。
选择
df['one']
a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
运算
df['three'] = df['one'] * df['two']
df
|
one |
two |
three |
a |
1.0 |
1.0 |
1.0 |
b |
2.0 |
2.0 |
4.0 |
c |
3.0 |
3.0 |
9.0 |
d |
NaN |
4.0 |
NaN |
df['flag'] = df['one'] > 2
df
|
one |
two |
three |
flag |
a |
1.0 |
1.0 |
1.0 |
False |
b |
2.0 |
2.0 |
4.0 |
False |
c |
3.0 |
3.0 |
9.0 |
True |
d |
NaN |
4.0 |
NaN |
False |
删除
del df['two']
df
|
one |
three |
flag |
a |
1.0 |
1.0 |
False |
b |
2.0 |
4.0 |
False |
c |
3.0 |
9.0 |
True |
d |
NaN |
NaN |
False |
three = df.pop('three')
three
a 1.0
b 4.0
c 9.0
d NaN
Name: three, dtype: float64
|
one |
flag |
a |
1.0 |
False |
b |
2.0 |
False |
c |
3.0 |
True |
d |
NaN |
False |
插入
df['foo'] = 'bar'
df
|
one |
flag |
foo |
a |
1.0 |
False |
bar |
b |
2.0 |
False |
bar |
c |
3.0 |
True |
bar |
d |
NaN |
False |
bar |
当插入一个与 DataFrame
索引不完全相同的 Series
时,它会按 DataFrame
的索引进行对齐(缺少的标签填充为 NaN):
df["one_trunc"] = df["one"][:2]
df
|
one |
flag |
foo |
one_trunc |
a |
1.0 |
False |
bar |
1.0 |
b |
2.0 |
False |
bar |
2.0 |
c |
3.0 |
True |
bar |
NaN |
d |
NaN |
False |
bar |
NaN |
默认情况下,列在最后插入。可以使用 DataFrame.insert 方法在指定的列位置插入:
df.insert(1, "bar", df["one"])
df
|
one |
bar |
flag |
foo |
one_trunc |
a |
1.0 |
1.0 |
False |
bar |
1.0 |
b |
2.0 |
2.0 |
False |
bar |
2.0 |
c |
3.0 |
3.0 |
True |
bar |
NaN |
d |
NaN |
NaN |
False |
bar |
NaN |