/*
 * Just-In-Time compiler for BPF filters on 32bit ARM
 *
 * Copyright (c) 2011 Mircea Gherzan
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; version 2 of the License.
 */

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/filter.h>
#include <linux/moduleloader.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <asm/cacheflush.h>
#include <asm/hwcap.h>
#include "bpf_jit_32.h"

/*
 * ABI:
 *
 * r0	scratch register
 * r4	BPF register A
 * r5	BPF register X
 * r6	pointer to the skb
 * r7	skb->data
 * r8	skb_headlen(skb)
 */

#define r_scratch	ARM_R0
/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */
#define r_off		ARM_R1
#define r_A		ARM_R4
#define r_X		ARM_R5
#define r_skb		ARM_R6
#define r_skb_data	ARM_R7
#define r_skb_hl	ARM_R8

#define SCRATCH_SP_OFFSET	0
#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + 4 * (k))

#define SEEN_MEM		((1 << BPF_MEMWORDS) - 1)
#define SEEN_MEM_WORD(k)	(1 << (k))
#define SEEN_X			(1 << BPF_MEMWORDS)
#define SEEN_CALL		(1 << (BPF_MEMWORDS + 1))
#define SEEN_SKB		(1 << (BPF_MEMWORDS + 2))
#define SEEN_DATA		(1 << (BPF_MEMWORDS + 3))

#define FLAG_NEED_X_RESET	(1 << 0)

struct jit_ctx {
	const struct sk_filter *skf;
	unsigned idx;
	unsigned prologue_bytes;
	int ret0_fp_idx;
	u32 seen;
	u32 flags;
	u32 *offsets;
	u32 *target;
#if __LINUX_ARM_ARCH__ < 7
	u16 epilogue_bytes;
	u16 imm_count;
	u32 *imms;
#endif
};

int bpf_jit_enable __read_mostly;

static u64 jit_get_skb_b(struct sk_buff *skb, unsigned offset)
{
	u8 ret;
	int err;

	err = skb_copy_bits(skb, offset, &ret, 1);

	return (u64)err << 32 | ret;
}

static u64 jit_get_skb_h(struct sk_buff *skb, unsigned offset)
{
	u16 ret;
	int err;

	err = skb_copy_bits(skb, offset, &ret, 2);

	return (u64)err << 32 | ntohs(ret);
}

static u64 jit_get_skb_w(struct sk_buff *skb, unsigned offset)
{
	u32 ret;
	int err;

	err = skb_copy_bits(skb, offset, &ret, 4);

	return (u64)err << 32 | ntohl(ret);
}

/*
 * Wrapper that handles both OABI and EABI and assures Thumb2 interworking
 * (where the assembly routines like __aeabi_uidiv could cause problems).
 */
static u32 jit_udiv(u32 dividend, u32 divisor)
{
	return dividend / divisor;
}

static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
{
	if (ctx->target != NULL)
		ctx->target[ctx->idx] = inst | (cond << 28);

	ctx->idx++;
}
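/*
 * Note added for clarity (not part of the original source): the JIT runs
 * in two passes over the filter.  On the first pass ctx->target is NULL,
 * so _emit() above only advances ctx->idx and the per-instruction offsets
 * can be measured; on the second pass the very same calls store the
 * actual instruction words, with the ARM condition code placed in bits
 * 31:28 of each encoding.  Likewise, the jit_get_skb_*() slowpath helpers
 * pack the skb_copy_bits() error code into the upper 32 bits of the
 * returned u64 and the payload (converted from network byte order) into
 * the lower 32 bits, so on little-endian EABI the generated code receives
 * them in r1/r0 and can branch to the "return 0" path when r1 != 0.
 */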
/*
 * Emit an instruction that will be executed unconditionally.
 */
static inline void emit(u32 inst, struct jit_ctx *ctx)
{
	_emit(ARM_COND_AL, inst, ctx);
}

static u16 saved_regs(struct jit_ctx *ctx)
{
	u16 ret = 0;

	if ((ctx->skf->len > 1) ||
	    (ctx->skf->insns[0].code == BPF_S_RET_A))
		ret |= 1 << r_A;

#ifdef CONFIG_FRAME_POINTER
	ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC);
#else
	if (ctx->seen & SEEN_CALL)
		ret |= 1 << ARM_LR;
#endif

	if (ctx->seen & (SEEN_DATA | SEEN_SKB))
		ret |= 1 << r_skb;

	if (ctx->seen & SEEN_DATA)
		ret |= (1 << r_skb_data) | (1 << r_skb_hl);

	if (ctx->seen & SEEN_X)
		ret |= 1 << r_X;

	return ret;
}

static inline int mem_words_used(struct jit_ctx *ctx)
{
	/* yes, we do waste some stack space IF there are "holes" in the set */
	return fls(ctx->seen & SEEN_MEM);
}

static inline bool is_load_to_a(u16 inst)
{
	switch (inst) {
	case BPF_S_LD_W_LEN:
	case BPF_S_LD_W_ABS:
	case BPF_S_LD_H_ABS:
	case BPF_S_LD_B_ABS:
	case BPF_S_ANC_CPU:
	case BPF_S_ANC_IFINDEX:
	case BPF_S_ANC_MARK:
	case BPF_S_ANC_PROTOCOL:
	case BPF_S_ANC_RXHASH:
	case BPF_S_ANC_VLAN_TAG:
	case BPF_S_ANC_VLAN_TAG_PRESENT:
	case BPF_S_ANC_QUEUE:
		return true;
	default:
		return false;
	}
}

static void build_prologue(struct jit_ctx *ctx)
{
	u16 reg_set = saved_regs(ctx);
	u16 first_inst = ctx->skf->insns[0].code;
	u16 off;

#ifdef CONFIG_FRAME_POINTER
	emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
	emit(ARM_PUSH(reg_set), ctx);
	emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
#else
	if (reg_set)
		emit(ARM_PUSH(reg_set), ctx);
#endif

	if (ctx->seen & (SEEN_DATA | SEEN_SKB))
		emit(ARM_MOV_R(r_skb, ARM_R0), ctx);

	if (ctx->seen & SEEN_DATA) {
		off = offsetof(struct sk_buff, data);
		emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx);
		/* headlen = len - data_len */
		off = offsetof(struct sk_buff, len);
		emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx);
		off = offsetof(struct sk_buff, data_len);
		emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
		emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx);
	}

	if (ctx->flags & FLAG_NEED_X_RESET)
		emit(ARM_MOV_I(r_X, 0), ctx);

	/* do not leak kernel data to userspace */
	if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst)))
		emit(ARM_MOV_I(r_A, 0), ctx);

	/* stack space for the BPF_MEM words */
	if (ctx->seen & SEEN_MEM)
		emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
}

static void build_epilogue(struct jit_ctx *ctx)
{
	u16 reg_set = saved_regs(ctx);

	if (ctx->seen & SEEN_MEM)
		emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);

	reg_set &= ~(1 << ARM_LR);

#ifdef CONFIG_FRAME_POINTER
	/* the first instruction of the prologue was: mov ip, sp */
	reg_set &= ~(1 << ARM_IP);
	reg_set |= (1 << ARM_SP);
	emit(ARM_LDM(ARM_SP, reg_set), ctx);
#else
	if (reg_set) {
		if (ctx->seen & SEEN_CALL)
			reg_set |= 1 << ARM_PC;
		emit(ARM_POP(reg_set), ctx);
	}

	if (!(ctx->seen & SEEN_CALL))
		emit(ARM_BX(ARM_LR), ctx);
#endif
}

/*
 * Try to encode x as an ARM "imm8m": an 8-bit value rotated right by an
 * even amount.  Returns the 12-bit operand encoding, or -1 if x does not
 * fit.
 */
static int16_t imm8m(u32 x)
{
	u32 rot;

	for (rot = 0; rot < 16; rot++)
		if ((x & ~ror32(0xff, 2 * rot)) == 0)
			return rol32(x, 2 * rot) | (rot << 8);

	return -1;
}

#if __LINUX_ARM_ARCH__ < 7

static u16 imm_offset(u32 k, struct jit_ctx *ctx)
{
	unsigned i = 0, offset;
	u16 imm;

	/* on the "fake" run we just count them (duplicates included) */
	if (ctx->target == NULL) {
		ctx->imm_count++;
		return 0;
	}

	while ((i < ctx->imm_count) && ctx->imms[i]) {
		if (ctx->imms[i] == k)
			break;
		i++;
	}

	if (ctx->imms[i] == 0)
		ctx->imms[i] = k;

	/* constants go just after the epilogue */
	offset = ctx->offsets[ctx->skf->len];
	offset += ctx->prologue_bytes;
	offset += ctx->epilogue_bytes;
	offset += i * 4;

	ctx->target[offset / 4] = k;

	/* PC in ARM mode == address of the instruction + 8 */
	imm = offset - (8 + ctx->idx * 4);

	return imm;
}

#endif /* __LINUX_ARM_ARCH__ */
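/*
 * Worked example (added for clarity, not part of the original source):
 * imm8m() succeeds only for constants expressible as an 8-bit value
 * rotated right by an even amount.  0xff000000 is 0xff ror 8, so the
 * loop hits at rot = 4 and returns 0x4ff (rotation field 4, byte 0xff);
 * 0x00ff00ff spans more than 8 contiguous bits and returns -1.  Such
 * constants are spilled to the literal pool on ARMv < 7, and
 * imm_offset() above converts a pool slot into a PC-relative LDR
 * offset: with the LDR at byte 'ctx->idx * 4' and the slot at byte
 * 'offset', the CPU reads PC as the instruction address + 8, hence
 * imm = offset - (8 + ctx->idx * 4).
 */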
/*
 * Move an immediate that's not an imm8m to a core register.
 */
static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx)
{
#if __LINUX_ARM_ARCH__ < 7
	emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
#else
	emit(ARM_MOVW(rd, val & 0xffff), ctx);
	if (val > 0xffff)
		emit(ARM_MOVT(rd, val >> 16), ctx);
#endif
}

static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx)
{
	int imm12 = imm8m(val);

	if (imm12 >= 0)
		emit(ARM_MOV_I(rd, imm12), ctx);
	else
		emit_mov_i_no8m(rd, val, ctx);
}

#if __LINUX_ARM_ARCH__ < 6

static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
{
	_emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx);
	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx);
	_emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx);
	_emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx);
	_emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx);
	_emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx);
	_emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx);
}

static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
{
	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx);
	_emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx);
}

static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx)
{
	emit(ARM_LSL_R(ARM_R1, r_src, 8), ctx);
	emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSL, 8), ctx);
	emit(ARM_LSL_I(r_dst, r_dst, 8), ctx);
	emit(ARM_LSL_R(r_dst, r_dst, 8), ctx);
}

#else /* ARMv6+ */

static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
{
	_emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx);
#ifdef __LITTLE_ENDIAN
	_emit(cond, ARM_REV(r_res, r_res), ctx);
#endif
}

static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
{
	_emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx);
#ifdef __LITTLE_ENDIAN
	_emit(cond, ARM_REV16(r_res, r_res), ctx);
#endif
}

static inline void emit_swap16(u8 r_dst __maybe_unused, u8 r_src __maybe_unused,