In [3]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# Define an explicit schema for the sample rows
schema = StructType(
    [
        StructField("Name", StringType(), True),
        StructField("Age", IntegerType(), True),
        StructField("Gender", StringType(), True),
    ]
)

# Replicate the three sample rows a million times to get a 3-million-row DataFrame
df = spark.createDataFrame(
    [['Raju', 24, 'Male'], ['Sita', 26, 'Female'], ['Andrew', 25, 'Male']] * 1000000,
    schema=schema,
)
In [4]:
df.show()
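df.show() prints only the first 20 rows, so the replication is easy to miss. If you want to confirm the size before benchmarking, a quick count is enough (a sketch, not part of the original run):

df.count()  # 3 sample rows * 1,000,000 replications = 3,000,000 rows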
In [5]:
%%time
# Plain Python UDF: applied row by row, with each value serialized between the JVM and Python
def square(x):
    return x**2

# Wrap the Python function as a Spark UDF that returns integers
square_udf_float = udf(lambda z: square(z), IntegerType())

df.select('Name', 'Age', square_udf_float('Age').alias('Age_squared')).collect()
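As a point of reference, this particular computation does not need a UDF at all: it can be expressed with built-in column operations, which stay inside the JVM and avoid the Python serialization overhead. A minimal sketch, not part of the original benchmark:

from pyspark.sql.functions import col

# Square the Age column using a native column expression (no Python UDF involved)
df.select('Name', 'Age', (col('Age') * col('Age')).alias('Age_squared')).collect()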
In [6]:
%%time
from pyspark.sql.functions import PandasUDFType, pandas_udf

# Vectorized (pandas) UDF: receives a whole pandas Series at a time instead of single values
def square_ages(ages):
    return ages**2

@pandas_udf('integer', PandasUDFType.SCALAR)
def pandas_udf_square(ages):
    return square_ages(ages)

df.select('Name', 'Age', pandas_udf_square('Age').alias('Age_squared')).collect()
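If you are on Spark 3.x, the same vectorized UDF can be declared with Python type hints, and the PandasUDFType argument is no longer needed (it is deprecated there). A sketch of the equivalent definition, assuming Spark 3.x; the name pandas_udf_square_hints is illustrative:

import pandas as pd
from pyspark.sql.functions import pandas_udf

# Type-hint style pandas UDF: the Series -> Series hints replace PandasUDFType.SCALAR
@pandas_udf('integer')
def pandas_udf_square_hints(ages: pd.Series) -> pd.Series:
    return ages ** 2

df.select('Name', 'Age', pandas_udf_square_hints('Age').alias('Age_squared')).collect()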