PythonPandas 用户警告: 因为非连接轴未对齐而进行排序

我做了一些代码实践和应用合并的数据框架,同时这样做 获得用户警告

/usr/lib64/python2.7/site-package/anda/core/frame.py: 6201: FuturePolice: 因为非连接轴未对齐而进行排序 大熊猫的排序将改为默认不排序。 要接受将来的行为,传递‘ sort = True’。 若要保留当前行为并使警告静默,请传递 sort = False

关于这几行代码: 请您帮助获得此警告的解决方案。

placement_video = [self.read_sql_vdx_summary, self.read_sql_video_km]
placement_video_summary = reduce(lambda left, right: pd.merge(left, right, on='PLACEMENT', sort=False), placement_video)




placement_by_video = placement_video_summary.loc[:, ["PLACEMENT", "PLACEMENT_NAME", "COST_TYPE", "PRODUCT",
"VIDEONAME", "VIEW0", "VIEW25", "VIEW50", "VIEW75",
"VIEW100",
"ENG0", "ENG25", "ENG50", "ENG75", "ENG100", "DPE0",
"DPE25",
"DPE50", "DPE75", "DPE100"]]


# print (placement_by_video)


placement_by_video["Placement# Name"] = placement_by_video[["PLACEMENT",
"PLACEMENT_NAME"]].apply(lambda x: ".".join(x),
axis=1)


placement_by_video_new = placement_by_video.loc[:,
["PLACEMENT", "Placement# Name", "COST_TYPE", "PRODUCT", "VIDEONAME",
"VIEW0", "VIEW25", "VIEW50", "VIEW75", "VIEW100",
"ENG0", "ENG25", "ENG50", "ENG75", "ENG100", "DPE0", "DPE25",
"DPE50", "DPE75", "DPE100"]]


placement_by_km_video = [placement_by_video_new, self.read_sql_km_for_video]
placement_by_km_video_summary = reduce(lambda left, right: pd.merge(left, right, on=['PLACEMENT', 'PRODUCT'], sort=False),
placement_by_km_video)


#print (list(placement_by_km_video_summary))
#print(placement_by_km_video_summary)
#exit()
# print(placement_by_video_new)
"""Conditions for 25%view"""
mask17 = placement_by_km_video_summary["PRODUCT"].isin(['Display', 'Mobile'])
mask18 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM", "CPCV"])
mask19 = placement_by_km_video_summary["PRODUCT"].isin(["InStream"])
mask20 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM", "CPE+", "CPCV"])
mask_video_video_completions = placement_by_km_video_summary["COST_TYPE"].isin(["CPCV"])
mask21 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE+"])
mask22 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM"])
mask23 = placement_by_km_video_summary["PRODUCT"].isin(['Display', 'Mobile', 'InStream'])
mask24 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM", "CPE+"])


choice25video_eng = placement_by_km_video_summary["ENG25"]
choice25video_vwr = placement_by_km_video_summary["VIEW25"]
choice25video_deep = placement_by_km_video_summary["DPE25"]


placement_by_km_video_summary["25_pc_video"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice25video_eng, choice25video_vwr, choice25video_deep])




"""Conditions for 50%view"""
choice50video_eng = placement_by_km_video_summary["ENG50"]
choice50video_vwr = placement_by_km_video_summary["VIEW50"]
choice50video_deep = placement_by_km_video_summary["DPE50"]


placement_by_km_video_summary["50_pc_video"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice50video_eng,
choice50video_vwr, choice50video_deep])


"""Conditions for 75%view"""


choice75video_eng = placement_by_km_video_summary["ENG75"]
choice75video_vwr = placement_by_km_video_summary["VIEW75"]
choice75video_deep = placement_by_km_video_summary["DPE75"]


placement_by_km_video_summary["75_pc_video"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice75video_eng,
choice75video_vwr,
choice75video_deep])


"""Conditions for 100%view"""


choice100video_eng = placement_by_km_video_summary["ENG100"]
choice100video_vwr = placement_by_km_video_summary["VIEW100"]
choice100video_deep = placement_by_km_video_summary["DPE100"]
choicecompletions = placement_by_km_video_summary['COMPLETIONS']


placement_by_km_video_summary["100_pc_video"] = np.select([mask17 & mask22, mask19 & mask24, mask17 & mask21, mask23 & mask_video_video_completions],
[choice100video_eng, choice100video_vwr, choice100video_deep, choicecompletions])






"""conditions for 0%view"""


choice0video_eng = placement_by_km_video_summary["ENG0"]
choice0video_vwr = placement_by_km_video_summary["VIEW0"]
choice0video_deep = placement_by_km_video_summary["DPE0"]


placement_by_km_video_summary["Views"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice0video_eng,
choice0video_vwr,
choice0video_deep])




#print (placement_by_km_video_summary)
#exit()


#final Table


placement_by_video_summary = placement_by_km_video_summary.loc[:,
["PLACEMENT", "Placement# Name", "PRODUCT", "VIDEONAME", "COST_TYPE",
"Views", "25_pc_video", "50_pc_video", "75_pc_video","100_pc_video",
"ENGAGEMENTS","IMPRESSIONS", "DPEENGAMENTS"]]


#placement_by_km_video = [placement_by_video_summary, self.read_sql_km_for_video]
#placement_by_km_video_summary = reduce(lambda left, right: pd.merge(left, right, on=['PLACEMENT', 'PRODUCT']),
#placement_by_km_video)




#print(placement_by_video_summary)
#exit()
# dup_col =["IMPRESSIONS","ENGAGEMENTS","DPEENGAMENTS"]


# placement_by_video_summary.loc[placement_by_video_summary.duplicated(dup_col),dup_col] = np.nan


# print ("Dhar",placement_by_video_summary)


'''adding views based on conditions'''
#filter maximum value from videos


placement_by_video_summary_new = placement_by_km_video_summary.loc[
placement_by_km_video_summary.reset_index().groupby(['PLACEMENT', 'PRODUCT'])['Views'].idxmax()]
#print (placement_by_video_summary_new)
#exit()
# print (placement_by_video_summary_new)
# mask22 = (placement_by_video_summary_new.PRODUCT.str.upper ()=='DISPLAY') & (placement_by_video_summary_new.COST_TYPE=='CPE')


placement_by_video_summary_new.loc[mask17 & mask18, 'Views'] = placement_by_video_summary_new['ENGAGEMENTS']
placement_by_video_summary_new.loc[mask19 & mask20, 'Views'] = placement_by_video_summary_new['IMPRESSIONS']
placement_by_video_summary_new.loc[mask17 & mask21, 'Views'] = placement_by_video_summary_new['DPEENGAMENTS']


#print (placement_by_video_summary_new)
#exit()
placement_by_video_summary = placement_by_video_summary.drop(placement_by_video_summary_new.index).append(
placement_by_video_summary_new).sort_index()


placement_by_video_summary["Video Completion Rate"] = placement_by_video_summary["100_pc_video"] / \
placement_by_video_summary["Views"]


placement_by_video_final = placement_by_video_summary.loc[:,
["Placement# Name", "PRODUCT", "VIDEONAME", "Views",
"25_pc_video", "50_pc_video", "75_pc_video", "100_pc_video",
"Video Completion Rate"]]
121733 次浏览

tl;dr:

concat and append currently sort the non-concatenation index (e.g. columns if you're adding rows) if the columns don't match. In pandas 0.23 this started generating a warning; pass the parameter sort=True to silence it. In the future the default will change to not sort, so it's best to specify either sort=True or False now, or better yet ensure that your non-concatenation indices match.


The warning is new in pandas 0.23.0:

In a future version of pandas pandas.concat() and DataFrame.append() will no longer sort the non-concatenation axis when it is not already aligned. The current behavior is the same as the previous (sorting), but now a warning is issued when sort is not specified and the non-concatenation axis is not aligned, link.

More information from linked very old github issue, comment by smcinerney :

When concat'ing DataFrames, the column names get alphanumerically sorted if there are any differences between them. If they're identical across DataFrames, they don't get sorted.

This sort is undocumented and unwanted. Certainly the default behavior should be no-sort.

After some time the parameter sort was implemented in pandas.concat and DataFrame.append:

sort : boolean, default None

Sort non-concatenation axis if it is not already aligned when join is 'outer'. The current default of sorting is deprecated and will change to not-sorting in a future version of pandas.

Explicitly pass sort=True to silence the warning and sort. Explicitly pass sort=False to silence the warning and not sort.

This has no effect when join='inner', which already preserves the order of the non-concatenation axis.

So if both DataFrames have the same columns in the same order, there is no warning and no sorting:

df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8]}, columns=['a', 'b'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3]}, columns=['a', 'b'])


print (pd.concat([df1, df2]))
a  b
0  1  0
1  2  8
0  4  7
1  5  3


df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8]}, columns=['b', 'a'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3]}, columns=['b', 'a'])


print (pd.concat([df1, df2]))
b  a
0  0  1
1  8  2
0  7  4
1  3  5

But if the DataFrames have different columns, or the same columns in a different order, pandas returns a warning if no parameter sort is explicitly set (sort=None is the default value):

df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8]}, columns=['b', 'a'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3]}, columns=['a', 'b'])


print (pd.concat([df1, df2]))

FutureWarning: Sorting because non-concatenation axis is not aligned.

   a  b
0  1  0
1  2  8
0  4  7
1  5  3


print (pd.concat([df1, df2], sort=True))
a  b
0  1  0
1  2  8
0  4  7
1  5  3


print (pd.concat([df1, df2], sort=False))
b  a
0  0  1
1  8  2
0  7  4
1  3  5

If the DataFrames have different columns, but the first columns are aligned - they will be correctly assigned to each other (columns a and b from df1 with a and b from df2 in the example below) because they exist in both. For other columns that exist in one but not both DataFrames, missing values are created.

Lastly, if you pass sort=True, columns are sorted alphanumerically. If sort=False and the second DafaFrame has columns that are not in the first, they are appended to the end with no sorting:

df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8], 'e':[5, 0]},
columns=['b', 'a','e'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3], 'c':[2, 8], 'd':[7, 0]},
columns=['c','b','a','d'])


print (pd.concat([df1, df2]))

FutureWarning: Sorting because non-concatenation axis is not aligned.

   a  b    c    d    e
0  1  0  NaN  NaN  5.0
1  2  8  NaN  NaN  0.0
0  4  7  2.0  7.0  NaN
1  5  3  8.0  0.0  NaN


print (pd.concat([df1, df2], sort=True))
a  b    c    d    e
0  1  0  NaN  NaN  5.0
1  2  8  NaN  NaN  0.0
0  4  7  2.0  7.0  NaN
1  5  3  8.0  0.0  NaN


print (pd.concat([df1, df2], sort=False))


b  a    e    c    d
0  0  1  5.0  NaN  NaN
1  8  2  0.0  NaN  NaN
0  7  4  NaN  2.0  7.0
1  3  5  NaN  8.0  0.0

In your code:

placement_by_video_summary = placement_by_video_summary.drop(placement_by_video_summary_new.index)
.append(placement_by_video_summary_new, sort=True)
.sort_index()

jezrael's answer is good, but did not answer a question I had: Will getting the "sort" flag wrong mess up my data in any way? The answer is apparently "no", you are fine either way.

from pandas import DataFrame, concat


a = DataFrame([{'a':1,      'c':2,'d':3      }])
b = DataFrame([{'a':4,'b':5,      'd':6,'e':7}])


>>> concat([a,b],sort=False)
a    c  d    b    e
0  1  2.0  3  NaN  NaN
0  4  NaN  6  5.0  7.0


>>> concat([a,b],sort=True)
a    b    c  d    e
0  1  NaN  2.0  3  NaN
0  4  5.0  NaN  6  7.0